1 Notes

This report was generated on 2022-09-20 14:35:42. R version: 4.2.0 on x86_64-apple-darwin17.0. For this report, CRAN packages as of 2022-05-01 were used.

1.1 R-Script & data

The preprocessing and analysis of the data was conducted in the R project for statistical computing. The RMarkdown script used to generate this document and all the resulting data can be downloaded under this link. Through executing main.Rmd, the herein described process can be reproduced and this document can be generated. In the course of this, data from the folder input will be processed and results will be written to output. The html on-line version of the analysis can be accessed through this link.

1.2 GitHub

The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/r-data-manipulation.

1.3 License

1.4 Data description of output files

1.4.0.1 abc.csv (Example)

Attribute Type Description
a Numeric
b Numeric
c Numeric

2 Set up

## [1] "package package:rmarkdown detached"

2.1 Define packages

# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# if you don't need a package, remove it from here (commenting not sufficient)
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
cat("
library(rstudioapi)
library(tidyverse)
library(data.table)
library(tidylog)
library(jsonlite)
library(lintr)
library(rmarkdown)
library(rio)
library(cowplot)
library(patchwork)
library(extrafont)
library(ggrepel)
library(pacman)
library(htmltab)
library(rmiscutils)
library(RSQLite)
library(fs)
library(openxlsx)
library(waldo)
library(vcdExtra)
library(psych)
library(Hmisc)
library(skimr)
library(gapminder)
library(lsr)
library(chron)
library(plm)
library(randomNames)
library(encryptr)
library(robotstxt)
library(tidymodels)
library(janitor)",
file = "manifest.R")

2.2 Install packages

# if checkpoint is not yet installed, install it (for people using this
# system for the first time)
if (!require(checkpoint)) {
  if (!require(devtools)) {
    install.packages("devtools", repos = "http://cran.us.r-project.org")
    require(devtools)
  }
  devtools::install_github("RevolutionAnalytics/checkpoint",
                           ref = "v0.3.2", # could be adapted later,
                           # as of now (beginning of July 2017
                           # this is the current release on CRAN)
                           repos = "http://cran.us.r-project.org")
  require(checkpoint)
}
# nolint start
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
checkpoint(snapshot_date = package_date,
           project = path_to_wd,
           verbose = T,
           scanForPackages = T,
           use.knitr = F,
           R.version = r_version)
rm(package_date)

2.3 Load packages

source("manifest.R")
unlink("manifest.R")
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] C/UTF-8/C/C/C/C
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] janitor_2.1.0       yardstick_0.0.9     workflowsets_0.2.1 
##  [4] workflows_0.2.6     tune_0.2.0          rsample_0.1.1      
##  [7] recipes_0.2.0       parsnip_0.2.1       modeldata_0.1.1    
## [10] infer_1.0.0         dials_0.1.1         scales_1.2.0       
## [13] broom_0.8.0         tidymodels_0.2.0    robotstxt_0.7.13   
## [16] encryptr_0.1.3      randomNames_1.5-0.0 plm_2.6-1          
## [19] chron_2.3-56        lsr_0.5.2           gapminder_0.3.0    
## [22] skimr_2.1.4         Hmisc_4.7-1         Formula_1.2-4      
## [25] survival_3.4-0      lattice_0.20-45     psych_2.2.3        
## [28] vcdExtra_0.8-0      gnm_1.1-2           vcd_1.4-9          
## [31] waldo_0.4.0         openxlsx_4.2.5      fs_1.5.2           
## [34] RSQLite_2.2.13      rmiscutils_0.2      htmltab_0.8.2      
## [37] pacman_0.5.1        ggrepel_0.9.1       extrafont_0.18     
## [40] patchwork_1.1.2     cowplot_1.1.1       rio_0.5.29         
## [43] rmarkdown_2.14      lintr_2.0.1         jsonlite_1.8.0     
## [46] tidylog_1.0.2       data.table_1.14.2   forcats_0.5.2      
## [49] stringr_1.4.1       dplyr_1.0.9         purrr_0.3.4        
## [52] readr_2.1.2         tidyr_1.2.0         tibble_3.1.8       
## [55] ggplot2_3.3.6       tidyverse_1.3.1     checkpoint_1.0.2   
## [58] rstudioapi_0.14     knitr_1.40         
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2          tidyselect_1.1.2    htmlwidgets_1.5.4  
##   [4] pROC_1.18.0         miscTools_0.6-26    munsell_0.5.0      
##   [7] codetools_0.2-18    interp_1.1-3        future_1.25.0      
##  [10] withr_2.5.0         colorspace_2.0-3    Rttf2pt1_1.3.10    
##  [13] ca_0.71.1           listenv_0.8.0       Rdpack_2.3         
##  [16] repr_1.1.4          mnormt_2.1.0        bit64_4.0.5        
##  [19] DiceDesign_1.9      rprojroot_2.0.3     parallelly_1.31.1  
##  [22] vctrs_0.4.1         generics_0.1.3      ipred_0.9-12       
##  [25] xfun_0.32           R6_2.5.1            rex_1.2.1          
##  [28] lhs_1.1.5           cachem_1.0.6        assertthat_0.2.1   
##  [31] nnet_7.3-17         gtable_0.3.0        globals_0.14.0     
##  [34] processx_3.7.0      sandwich_3.0-1      timeDate_3043.102  
##  [37] rlang_1.0.4         clisymbols_1.2.0    cyclocomp_1.1.0    
##  [40] splines_4.2.0       extrafontdb_1.0     lazyeval_0.2.2     
##  [43] checkmate_2.1.0     yaml_2.3.5          modelr_0.1.8       
##  [46] backports_1.4.1     tools_4.2.0         collapse_1.7.6     
##  [49] lava_1.6.10         ellipsis_0.3.2      jquerylib_0.1.4    
##  [52] RColorBrewer_1.1-3  plyr_1.8.7          Rcpp_1.0.9         
##  [55] base64enc_0.1-3     ps_1.7.1            rpart_4.1.16       
##  [58] deldir_1.0-6        zoo_1.8-10          haven_2.5.1        
##  [61] cluster_2.1.4       furrr_0.3.1         magrittr_2.0.3     
##  [64] GPfit_1.0-8         lmtest_0.9-40       reprex_2.0.1       
##  [67] hms_1.1.2           evaluate_0.16       jpeg_0.1-9         
##  [70] readxl_1.4.1        gridExtra_2.3       testthat_3.1.4     
##  [73] compiler_4.2.0      bdsmatrix_1.3-4     crayon_1.5.1       
##  [76] htmltools_0.5.3     tzdb_0.3.0          lubridate_1.8.0    
##  [79] DBI_1.1.3           dbplyr_2.1.1        MASS_7.3-58.1      
##  [82] toOrdinal_1.3-0.0   relimp_1.0-5        Matrix_1.4-1       
##  [85] brio_1.1.3          cli_3.3.0           rbibutils_2.2.8    
##  [88] parallel_4.2.0      gower_1.0.0         pkgconfig_2.0.3    
##  [91] foreign_0.8-82      foreach_1.5.2       xml2_1.3.3         
##  [94] bslib_0.4.0         hardhat_0.2.0       prodlim_2019.11.13 
##  [97] rvest_1.0.2         snakecase_0.11.0    callr_3.7.2        
## [100] digest_0.6.29       cellranger_1.1.0    htmlTable_2.4.1    
## [103] maxLik_1.5-2        curl_4.3.2          lifecycle_1.0.1    
## [106] nlme_3.1-159        desc_1.4.1          qvcalc_1.0.2       
## [109] fansi_1.0.3         pillar_1.8.1        fastmap_1.1.0      
## [112] httr_1.4.4          glue_1.6.2          remotes_2.4.2      
## [115] zip_2.2.0           iterators_1.0.14    png_0.1-7          
## [118] bit_4.0.4           class_7.3-20        stringi_1.7.8      
## [121] sass_0.4.2          blob_1.2.3          latticeExtra_0.6-30
## [124] memoise_2.0.1       future.apply_1.9.0

2.4 Load additional scripts

# if you want to outsource logic to other script files, see README for 
# further information
# Load all visualizations functions as separate scripts
knitr::read_chunk("scripts/dviz.supp.R")
source("scripts/dviz.supp.R")
knitr::read_chunk("scripts/themes.R")
source("scripts/themes.R")
knitr::read_chunk("scripts/plot_grid.R")
source("scripts/plot_grid.R")
knitr::read_chunk("scripts/align_legend.R")
source("scripts/align_legend.R")
knitr::read_chunk("scripts/label_log10.R")
source("scripts/label_log10.R")
knitr::read_chunk("scripts/outliers.R")
source("scripts/outliers.R")

3 Import and export data with R

There are some different ways to do this, depending on how your data is formatted and where it’s located.

3.1 Base-R

3.1.1 Entering data using R commands

R provides a nice GUI for editing tabular data: the data editor.

The c function has already been introduced as a way to input small amounts of data into R. When the amount of data is large, and especially when typing the data into the console is inappropriate, the scan function can be used. scan is most appropriate when all the data to be read is of the same mode, so that it can be accommodated by a vector or matrix. The first argument to scan can be a quoted string or character variable containing the name of a file or a URL, or it can be any of a number of connections to allow other input sources. If no first argument is given, scan will read from the console, stopping when a completely blank line is entered. By default, scan expects all of its input to be numeric data; this can be overridden with the what= argument, which specifies the type of data that scan will see.

When reading from the console, R will prompt you with the index of the next item to be entered, and report on the number of elements read when it’s done. If the what= argument to scan is a list containing examples of the expected data types, scan will output a list with as many elements as there are data types provided. To specify numeric values, you can pass a value of 0.

Note that, by naming the elements in the list passed through the what= argument, the output list elements are appropriately named. When the argument to what= is a list, the multi.line= option can be set to FALSE to prevent scan from trying to use multiple lines to read the records for an observation. One of the most common uses for scan is to read in data matrices. Since scan returns a vector, a call to scan can be embedded inside a call to the matrix function.

salary <- c(18700000, 14626720, 14137500, 13980000, 12916666)
position <- c("QB", "QB", "DE", "QB", "QB")
team <- c("Colts", "Patriots", "Panthers", "Bengals", "Giants")
name.last <- c("Manning", "Brady", "Pepper", "Palmer", "Manning")
name.first <- c("Peyton", "Tom", "Julius", "Carson", "Eli")
top.5.salaries <- data.frame(name.last, name.first, team, position, salary)
top.5.salaries
##   name.last name.first     team position   salary
## 1   Manning     Peyton    Colts       QB 18700000
## 2     Brady        Tom Patriots       QB 14626720
## 3    Pepper     Julius Panthers       DE 14137500
## 4    Palmer     Carson  Bengals       QB 13980000
## 5   Manning        Eli   Giants       QB 12916666
# calling the built-in data editor
# top.5.salaries <- edit(top.5.salaries)
# fix(top.5.salaries)

# using scan
# names <- scan(what = "")
# names
# names2 = scan(what=list(a=0,b="",c=0))
# names2

# creating a matrix
# mymat <- matrix(scan(), ncol = 3, byrow = TRUE)
# mymat

3.1.2 Importing data from external files

R includes a family of functions for importing delimited text files into R, based on the read.table function. The read.table function reads a text file into R and returns a data.frame object. Each row in the input file is interpreted as an observation. Each column in the input file represents a variable. The read.table function expects each field to be separated by a delimiter. The most important options are sep and header. R includes a set of convenience functions that call read.table with different default options for these values. Besides that, you can fetch a CSV file from a single URL.

read.table options

Although not as common as white-space-, tab-, or comma-separated data, sometimes input data is stored with no delimiters between the values, but with each variable occupying the same columns on each line of input. In cases like this, the read.fwf function can be used. The widths= argument can be a vector containing the widths of the fields to be read, using negative numbers to indicate columns to be skipped. If the data for each observation occupies more than one line, widths= can be a list of as many vectors as there are lines per observation. The header=, row.names=, and col.names= arguments behave similarly to those in read.table.

snowdata <- read.table("input/BostonWinterSnowfalls.csv", header = TRUE, sep = ",", quote = "\"")

# getting data online
sp500 <- read.csv("http://bit.ly/BostonSnowfallCSV", sep="")

# getting data with no delimiters
ff <- tempfile()
cat(file = ff, "New York, NY 66,834.6 
    Kings, NY 34,722.9 
    Bronx, NY 31,729.8 
    Queens, NY 20,453.0 
    San Francisco, CA 16,526.2 
    Hudson, NJ 12,956.9 
    Suffolk, MA 11,691.6 
    Philadelphia, PA 11,241.1 
    Washington, DC 9,378.0 
    Alexandria IC, VA 8,552.2")
city <- read.fwf(ff, widths = c(18, -19, 8), as.is = TRUE)
city
##                    V1 V2
## 1  New York, NY 66,83 NA
## 2      Kings, NY 34,7 NA
## 3      Bronx, NY 31,7 NA
## 4      Queens, NY 20, NA
## 5      San Francisco, NA
## 6      Hudson, NJ 12, NA
## 7      Suffolk, MA 11 NA
## 8      Philadelphia,  NA
## 9      Washington, DC NA
## 10     Alexandria IC, NA

3.1.3 Exporting data

R can also export R data objects (usually data frames and matrices) as text files. To export data to a text file, use the write.table function.There are wrapper functions for write.table that call write.table with different defaults. These are useful if you want to create a file of comma-separated values.

# write.table(snowdata, file = "output/snowdata.txt", quote = FALSE, sep = ",", row.names = FALSE)
# write.csv(snowdata, file = "output/snowdata.csv", row.names = FALSE)

3.1.4 Importing data from databases

In order to connect directly to a database from R, you will need to install some optional packages. The packages you need depend on the database(s) to which you want to connect and the connection method you want to use.

There are two sets of database interfaces available in R:

  • RODBC. The RODBC package allows R to fetch data from ODBC (Open DataBase Connectivity) connections. ODBC provides a standard interface for different programs to connect to databases.

  • DBI. The DBI package allows R to connect to databases using native database drivers or JDBC drivers. This package provides a common database abstraction for R software.

DBI is not a single package, but instead is a framework and set of packages for accessing databases.One important difference between the DBI packages and the RODBC package is in the objects they use: DBI uses S4 objects to represent drivers, connections, and other objects. To open a connection with DBI, use the dbConnect function. The argument drv can be a DBIDriver object or a character value describing the driver to use. You can generate a DBIDriver object with a call to the DBI driver. The dbConnect function can take additional options, depending on the type of database you are using. For SQLite databases, the most important argument is dbname (which specifies the database file). Check the help files for the database you are using for more options. Even arguments for parameters like usernames are not the same between databases.

As example we can easily copy an R data frame into a SQLite database with dbWriteTable().

# to connect with an external database
# drv <- dbDriver("SQLite")
# con <- dbConnect(drv, dbname = system.file("extdata", "bb.db", package = "nutshell"))

# creating our database
mydb <- dbConnect(RSQLite::SQLite(), "")
dbWriteTable(mydb, "mtcars", mtcars)
dbWriteTable(mydb, "iris", iris)
dbListTables(mydb)
## [1] "iris"   "mtcars"
# Issue a query with dbGetQuery()
dbGetQuery(mydb, 'SELECT * FROM mtcars LIMIT 5')
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
# disconnecting from dabase
dbDisconnect(mydb)

3.2 Tidyverse

3.2.1 Reading data

El paquete readr, incluido en la familia tidyverse, admite la lectura de múltiples formatos de archivo usando funciones que comienzan por read_* o write_*. Por defecto, la función read_excel() importa la primera hoja. Para importar una hoja diferente es necesario indicarlo con el argumento sheet o bien el número o el nombre (segundo argumento). La función más importante para leer múltiples hojas es map() del paquete purrr que forma para de la colección de paquetes tidyverse. map() permite aplicar una función a cada elemento de un vector o lista. Existe una variante de map() que directamente nos une todas las tablas por fila: map_df(). Si fuese necesario unir por columna, se debería usar map_dfc().

La función dir_ls() del paquete fs (https://github.com/r-lib/fs), a pesar de ser similar a dir() de R Base, tiene algunas ventajas, como su total compatibilidad con la colección de funciones de tidyverse. Por ejemplo, el argumento regexp permite buscar con una expresión regular un patrón en las rutas y ficheros.

Funciones en readr

Most of us would probably read the CSV file first and then do the data cleaning. For example, using the clean_names function from the janitor package. The same can be achieved inside read_csv with the function make_clean_names for the name_repair argument. The function uses the snake naming convention by default. Snake converts all names to lowercase and separates words with an underscore. Besides that, clean_names does not work with vectors, but make_clean_names does.

List of all naming conventions

With make_clean_names you can also replace certain characters from the column names. If you are familiar with regular expressions, you can make more complex replacements. Apart from cleaning your column names, you can also select columns directly from read_csv using the col_select argument.

# janitor approach
mpg_new <- read_csv("input/mpg_uppercase.csv", show_col_types = FALSE) %>%
  janitor::clean_names() %>%
  select(c(manufacturer, model)) %>%
  glimpse()
## Rows: 6
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4"
# tidyverse approach
read_csv("input/mpg_uppercase.csv", name_repair = make_clean_names, show_col_types = FALSE) %>%
  glimpse()
## Rows: 6
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4"
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8
## $ year         <dbl> 1999, 1999, 2008, 2008, 1999, 1999
## $ cyl          <dbl> 4, 4, 4, 4, 6, 6
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
## $ cty          <dbl> 18, 21, 20, 21, 16, 18
## $ hwy          <dbl> 29, 29, 31, 30, 26, 26
## $ fl           <chr> "p", "p", "p", "p", "p", "p"
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
# replacing and removing character strings with make_clean_names
make_clean_names(c("A", "B%", "C"), replace = c("%" = "_percent"))
## [1] "a"         "b_percent" "c"
# with reg expressions
make_clean_names(c("A_1", "B_1", "C_1"), replace = c("^A_" = "a"))
## [1] "a1"  "b_1" "c_1"
# snake naming convention per default
make_clean_names(c("myHouse", "MyGarden"), case = "snake")
## [1] "my_house"  "my_garden"
make_clean_names(c("myHouse", "MyGarden"), case = "none")
## [1] "myHouse"  "MyGarden"
read_csv("input/mpg_uppercase.csv", show_col_types = FALSE, name_repair = ~ make_clean_names(., case = "upper_camel")) %>% # The dot . in make_clean_names denotes the vector of column names. 
  glimpse()
## Rows: 6
## Columns: 11
## $ Manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ Model        <chr> "a4", "a4", "a4", "a4", "a4", "a4"
## $ Displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8
## $ Year         <dbl> 1999, 1999, 2008, 2008, 1999, 1999
## $ Cyl          <dbl> 4, 4, 4, 4, 6, 6
## $ Trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ Drv          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
## $ Cty          <dbl> 18, 21, 20, 21, 16, 18
## $ Hwy          <dbl> 29, 29, 31, 30, 26, 26
## $ Fl           <chr> "p", "p", "p", "p", "p", "p"
## $ Class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
# selecting specific columns
read_csv("input/mpg_uppercase.csv", show_col_types = FALSE, name_repair = make_clean_names, col_select = c(manufacturer, model)) %>% 
  glimpse()
## Rows: 6
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi"
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4"

Column names often contain spaces, special characters, or are written in a mixture of lower and upper case characters. Such poorly formatted column names can lead to numerous problems. We could easily solve these issues with the rename function but this approach does not scale. The main difference between remain and remain_with is that remain_with changes the column names using a function. The three main arguments of the function are .data, .fn and .cols. .data stands for the data frame, .fn for the function to apply to the column names, and .cols for the columns to apply the function to.

Use a specific naming convention for column names using the make_clean_names function from the janitor package. It is used the tilde operator to indicate an anonymous function. This shortcut is needed whenever you need to call certain arguments of a function.

Another use case of rename_with is the replacement of characters. We use the gsub function to replace a specific character. Alternatively, we could have used the str_replace function. With pattern we said that we are looking for a group of characters containing one or more digits (\d+). \d+ is a regular expression. A group in the argument pattern is everything between two brackets. With replacement we said that we want to put an underscore in front of this group. The group itself is specified by \1. If we had two groups, the second group would be specified by \2.

You can use .cols to specify which column names to apply the function to. And you can even use our tidyselect functions for that. Another useful function is matches. With matches, you can search for specific patterns in your column names and apply a function to the column names that match the pattern.

mpg %>%
  rename_with(
    .fn = toupper,
    .cols = everything()
  )
## # A tibble: 234 × 11
##    MANUFACTURER MODEL      DISPL  YEAR   CYL TRANS DRV     CTY   HWY FL    CLASS
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # … with 224 more rows
# the same as
mpg %>%
  rename_with(
    .fn = toupper,
    .cols = everything()
  ) %>%
  colnames()
##  [1] "MANUFACTURER" "MODEL"        "DISPL"        "YEAR"         "CYL"         
##  [6] "TRANS"        "DRV"          "CTY"          "HWY"          "FL"          
## [11] "CLASS"
# using janitor to build a specific name convention
iris %>%
  rename_with(~ janitor::make_clean_names(., case = "big_camel")) %>%
  colnames()
## [1] "SepalLength" "SepalWidth"  "PetalLength" "PetalWidth"  "Species"
# replacing characters
mpg %>%
  rename_with(~ gsub("e", "_", .)) %>%
  colnames()
##  [1] "manufactur_r" "mod_l"        "displ"        "y_ar"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"
# or
mpg %>%
  rename_with(~ str_replace(., "e", "_")) %>%
  colnames()
##  [1] "manufactur_r" "mod_l"        "displ"        "y_ar"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"
# replacing characters using grouping function
anscombe %>%
  rename_with(~ str_replace(.,
    pattern = "(\\d+)",
    replacement = "_\\1"
  )) %>%
  colnames()
## [1] "x_1" "x_2" "x_3" "x_4" "y_1" "y_2" "y_3" "y_4"
# renaming variables for specific variables
anscombe %>%
  rename_with(~ str_replace(
    ., "([:alpha:])([1-2])",
    "\\1psilon\\2_"
  ), c(y1, y2)) %>%
  colnames()
## [1] "x1"        "x2"        "x3"        "x4"        "ypsilon1_" "ypsilon2_"
## [7] "y3"        "y4"
# using tidyverse functions
mpg %>%
  rename_with(~ toupper(.), where(is.numeric)) %>%
  colnames()
##  [1] "manufacturer" "model"        "DISPL"        "YEAR"         "CYL"         
##  [6] "trans"        "drv"          "CTY"          "HWY"          "fl"          
## [11] "class"
iris %>%
  rename_with(
    ~ str_replace(., "\\.", "_"),
    starts_with("Sepal")
  ) %>%
  colnames()
## [1] "Sepal_Length" "Sepal_Width"  "Petal.Length" "Petal.Width"  "Species"
# using matches
iris %>%
  rename_with(
    ~ str_replace(., "\\.", "_"),
    matches("[Ww]idth$")
  ) %>%
  colnames()
## [1] "Sepal.Length" "Sepal_Width"  "Petal.Length" "Petal_Width"  "Species"

You don”t always read just one file into R. It is not uncommon for your data to be scattered in hundreds or thousands of files. Of course, you don”t want to read these files into R manually. So you need an automatic method for reading in files. Before we can read the files into R, we need to create a character vector of the file paths. You have several options to create such a vector. You can use the R base function list.files, which returns character vectors of the names of files in a directory or you use the function dir_ls from the fs package. The other option is to use the dir_ls function from the fs package. fs provides a cross-platform interface for accessing files on your hard disk. It supports all file operations (deleting, creating files, moving files, etc.).

Now that we know the file paths, we can load the files into R. The tidyverse way to do this is to use the map_dfr function from the purrr package. map_dfr loops through all the file paths and binds the data frames into a single data frame. The .x in the following code stands for the file name. To output the actual csv files and not the filenames, we need to put .x (the path) in a read_* function. In this example we are working with CSV files. The trick works the same for all rectangular file formats. Another approach is to use the read_csv function directly by putting the character vector of the file names directly into read_csv.

Sometimes your files are deeply nested. In that case, we need to search through each folder recursively. If you try to load all csv files from the nested_folders folder, you would get an empty vector. This is because dir_ls does not look in the nested folders, but only in the parent folder. To make dir_ls search through the folders recursively, you need to set the recurse argument to TRUE.

You don”t always need all the files in your directory and need to remove some files from the list of file paths. A good way to do this is to use the str_detect function from the stringr package. The function returns logical values. To change the actual character vector, we need to add these logical values to the character vector itself. With the negate argument you can find only the files that do not match the pattern.

horas_sol <- read_csv("input/SS_STAID001395.txt", skip = 19) |> # los datos empiezan en la linea 20
  janitor::clean_names()
head(horas_sol)
## # A tibble: 6 × 4
##    souid     date    ss  q_ss
##    <dbl>    <dbl> <dbl> <dbl>
## 1 120414 19560501 -9999     9
## 2 120414 19560502 -9999     9
## 3 120414 19560503 -9999     9
## 4 120414 19560504 -9999     9
## 5 120414 19560505 -9999     9
## 6 120414 19560506 -9999     9
# .xlsx files
# importing .xls file
emisiones <- readxl::read_xls("input/env_air_gge.xls", sheet = 1, skip = 362, n_max = 36)
head(emisiones)
## # A tibble: 6 × 11
##   GEO/TI…¹ `2007` `2008` `2009` `2010` `2011` `2012` `2013` `2014` `2015` `2016`
##   <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Europea… 4.33e5 4.30e5 4.25e5 4.20e5 4.20e5 4.18e5 4.21e5 4.28e5 4.30e5 4.30e5
## 2 Europea… 4.34e5 4.31e5 4.26e5 4.21e5 4.21e5 4.19e5 4.22e5 4.29e5 4.30e5 4.31e5
## 3 Belgium  1.03e4 1.02e4 1.03e4 1.02e4 1.01e4 9.92e3 9.96e3 1.02e4 1.01e4 9.90e3
## 4 Bulgaria 4.90e3 5.15e3 4.98e3 5.45e3 5.11e3 5.24e3 5.72e3 6.19e3 6.24e3 6.53e3
## 5 Czech R… 7.84e3 7.99e3 7.58e3 7.41e3 7.59e3 7.58e3 7.76e3 7.96e3 8.16e3 8.52e3
## 6 Denmark  1.08e4 1.07e4 1.05e4 1.04e4 1.04e4 1.04e4 1.03e4 1.05e4 1.04e4 1.05e4
## # … with abbreviated variable name ¹​`GEO/TIME`
# iterate over multiple worksheets in a workbook
path <- "input/madrid_temp.xlsx"
mad <- path %>%
  readxl::excel_sheets() %>%
  set_names() %>%
  map_df(readxl::read_excel,
    path = path, .id = "yr"
  )
head(mad)
## # A tibble: 6 × 3
##   yr    date                   ta
##   <chr> <dttm>              <dbl>
## 1 2000  2000-01-01 00:00:00   5.4
## 2 2000  2000-01-02 00:00:00   5  
## 3 2000  2000-01-03 00:00:00   3.5
## 4 2000  2000-01-04 00:00:00   4.3
## 5 2000  2000-01-05 00:00:00   0.6
## 6 2000  2000-01-06 00:00:00   3.8
# importing and reading several .xlsx files at once without merging
dir_ls("input", regexp = "xlsx") %>%
  map(readxl::read_excel)
## $`input/berlin_temp.xlsx`
## # A tibble: 366 × 2
##    date                   ta
##    <dttm>              <dbl>
##  1 2000-01-01 00:00:00   1.2
##  2 2000-01-02 00:00:00   3.6
##  3 2000-01-03 00:00:00   5.7
##  4 2000-01-04 00:00:00   5.1
##  5 2000-01-05 00:00:00   2.2
##  6 2000-01-06 00:00:00   1.8
##  7 2000-01-07 00:00:00   4.2
##  8 2000-01-08 00:00:00   4.2
##  9 2000-01-09 00:00:00   4.2
## 10 2000-01-10 00:00:00   1.7
## # … with 356 more rows
## 
## $`input/madrid_temp.xlsx`
## # A tibble: 366 × 2
##    date                   ta
##    <dttm>              <dbl>
##  1 2000-01-01 00:00:00   5.4
##  2 2000-01-02 00:00:00   5  
##  3 2000-01-03 00:00:00   3.5
##  4 2000-01-04 00:00:00   4.3
##  5 2000-01-05 00:00:00   0.6
##  6 2000-01-06 00:00:00   3.8
##  7 2000-01-07 00:00:00   6.2
##  8 2000-01-08 00:00:00   5.4
##  9 2000-01-09 00:00:00   5.5
## 10 2000-01-10 00:00:00   4.8
## # … with 356 more rows
# merging into a new column
data_df <- dir_ls("input", regexp = "xlsx") %>% 
  map_df(readxl::read_excel, .id = "city")
# cleaning city column
data_df <- mutate(data_df, city = path_file(city) %>% 
                    path_ext_remove() %>% 
                    str_replace("_temp", ""))
head(data_df)
## # A tibble: 6 × 3
##   city   date                   ta
##   <chr>  <dttm>              <dbl>
## 1 berlin 2000-01-01 00:00:00   1.2
## 2 berlin 2000-01-02 00:00:00   3.6
## 3 berlin 2000-01-03 00:00:00   5.7
## 4 berlin 2000-01-04 00:00:00   5.1
## 5 berlin 2000-01-05 00:00:00   2.2
## 6 berlin 2000-01-06 00:00:00   1.8
# .csv files
# adding new directory
# dir_create("input", c("many_files"))

# creating random samples from mpg data set
# mpg_samples <- map(1:25, ~ slice_sample(mpg, n = 20))

# adding .csv files from samples to the new directory
# iwalk(mpg_samples, ~ write_csv(., paste0("input/many_files/", .y, ".csv")))

# creating a character vector of file paths
# with list.files from Base-R
(csv_files_list_files <- list.files(path = "input/many_files", pattern = "csv", full.names = TRUE))
##  [1] "input/many_files/1.csv"  "input/many_files/10.csv"
##  [3] "input/many_files/11.csv" "input/many_files/12.csv"
##  [5] "input/many_files/13.csv" "input/many_files/14.csv"
##  [7] "input/many_files/15.csv" "input/many_files/16.csv"
##  [9] "input/many_files/17.csv" "input/many_files/18.csv"
## [11] "input/many_files/19.csv" "input/many_files/2.csv" 
## [13] "input/many_files/20.csv" "input/many_files/21.csv"
## [15] "input/many_files/22.csv" "input/many_files/23.csv"
## [17] "input/many_files/24.csv" "input/many_files/25.csv"
## [19] "input/many_files/3.csv"  "input/many_files/4.csv" 
## [21] "input/many_files/5.csv"  "input/many_files/6.csv" 
## [23] "input/many_files/7.csv"  "input/many_files/8.csv" 
## [25] "input/many_files/9.csv"
# with dir_ls from fs package
(csv_files_dir_ls <- dir_ls(path = "input/many_files/", glob = "*.csv", type = "file"))
## input/many_files/1.csv  input/many_files/10.csv input/many_files/11.csv 
## input/many_files/12.csv input/many_files/13.csv input/many_files/14.csv 
## input/many_files/15.csv input/many_files/16.csv input/many_files/17.csv 
## input/many_files/18.csv input/many_files/19.csv input/many_files/2.csv  
## input/many_files/20.csv input/many_files/21.csv input/many_files/22.csv 
## input/many_files/23.csv input/many_files/24.csv input/many_files/25.csv 
## input/many_files/3.csv  input/many_files/4.csv  input/many_files/5.csv  
## input/many_files/6.csv  input/many_files/7.csv  input/many_files/8.csv  
## input/many_files/9.csv
# another example using map.df(), list.files() and rio::import()
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import)
head(contributions)
##        Date       Contributor           Address       City State   Zip
## 1 7/30/2017     Curt DeChicco              <NA>       <NA>    MA  <NA>
## 2 7/30/2017   Nicolle Eduardo              <NA>       <NA>    MA  <NA>
## 3 7/30/2017 Heidi Vasconcelos              <NA>       <NA>    MA  <NA>
## 4 7/30/2017       Robert Fair      656 Grove St Framingham    MA 01701
## 5 7/30/2017   Jonates Azevedo 2 Dell Ann Circle    Milford    MA 01757
## 6 7/30/2017 Horrigan Jennifer              <NA>       <NA>    MA  <NA>
##         Occupation Employer Amount             Recipient
## 1             <NA>     <NA>     40 Horrigan, Joshua Paul
## 2             <NA>     <NA>     20 Horrigan, Joshua Paul
## 3             <NA>     <NA>     20 Horrigan, Joshua Paul
## 4 Jewell Insurance     Self    300 Horrigan, Joshua Paul
## 5             <NA>     <NA>    100 Horrigan, Joshua Paul
## 6             <NA>     <NA>     20 Horrigan, Joshua Paul
# using janitor::tabyl() function to count number of rows within a group
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import) %>%
  filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
  distinct(Contributor, Address, .keep_all = TRUE) %>%
  tabyl(Recipient, sort = TRUE) %>%
  # mutate(percent = round(percent * 100, 1)) %>% 
  select(Candidate = Recipient, Pct_Local_Contributors = percent)
contributions
##                   Candidate Pct_Local_Contributors
##       Horrigan, Joshua Paul            0.035820896
##  Neves-Grigg, Sr., Benjaman            0.011940299
##                 Sen, Dhruba            0.008955224
##             Sousa, Priscila            0.029850746
##       Spicer, Dr. Yvonne M.            0.516417910
##          Stefanini, John A.            0.337313433
##             Tilden, Mark S.            0.059701493
# using adorn_percentages()
results <- readr::read_csv("input/election_framingham_mayor_2017_09.csv", col_names = TRUE) %>% 
  dplyr::select(Candidate, Totals)
results
## # A tibble: 9 × 2
##   Candidate                Totals
##   <chr>                     <dbl>
## 1 Blanks                       56
## 2 Joshua Paul Horrigan        545
## 3 John A. Stefanini          3184
## 4 Dhruba P. Sen               101
## 5 Mark S. Tilden              439
## 6 Yvonne M. Spicer           5967
## 7 Benjaman A. Neves-Grigg,    134
## 8 Priscila Sousa              538
## 9 Write-Ins                    42
results <- results %>%
  filter(!(Candidate %in% c("Blanks", "Write-Ins"))) %>%
  adorn_percentages(denominator = "col") %>% 
   rename(Pct_Vote = Totals)
results
##                 Candidate    Pct_Vote
##      Joshua Paul Horrigan 0.049963330
##         John A. Stefanini 0.291895856
##             Dhruba P. Sen 0.009259259
##            Mark S. Tilden 0.040245691
##          Yvonne M. Spicer 0.547029703
##  Benjaman A. Neves-Grigg, 0.012284562
##            Priscila Sousa 0.049321599
contributions_split <- tidyr::separate(
  contributions, Candidate,
  c("LastName", "FirstName"), ", ", 2
) %>%
  select(-FirstName)
head(contributions_split)
##     LastName Pct_Local_Contributors
##     Horrigan            0.035820896
##  Neves-Grigg            0.011940299
##          Sen            0.008955224
##        Sousa            0.029850746
##       Spicer            0.516417910
##    Stefanini            0.337313433
results_split <- tidyr::separate(results, Candidate, c("FirstName", "MiddleName", "LastName"), " ")
tail(results_split)
## # A tibble: 6 × 4
##   FirstName MiddleName LastName     Pct_Vote
##   <chr>     <chr>      <chr>           <dbl>
## 1 John      A.         Stefanini     0.292  
## 2 Dhruba    P.         Sen           0.00926
## 3 Mark      S.         Tilden        0.0402 
## 4 Yvonne    M.         Spicer        0.547  
## 5 Benjaman  A.         Neves-Grigg,  0.0123 
## 6 Priscila  Sousa      <NA>          0.0493
results_split %<>%
  mutate(
    LastName = ifelse(is.na(LastName), MiddleName, LastName),
    LastName = str_replace(LastName, ",", "")
  ) %>%
  select(-FirstName, -MiddleName)
tail(results_split)
## # A tibble: 6 × 2
##   LastName    Pct_Vote
##   <chr>          <dbl>
## 1 Stefanini    0.292  
## 2 Sen          0.00926
## 3 Tilden       0.0402 
## 4 Spicer       0.547  
## 5 Neves-Grigg  0.0123 
## 6 Sousa        0.0493
# reading the files from a character vector of paths
data_frames <- map_dfr(csv_files_dir_ls, ~ read_csv(.x, show_col_types = FALSE))
glimpse(data_frames)
## Rows: 500
## Columns: 11
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model        <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ        <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year         <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl          <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans        <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv          <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty          <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy          <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl           <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class        <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
# and with a new column representing the file name
map_dfr(csv_files_dir_ls, ~ read_csv(.x, , show_col_types = FALSE) %>%
  mutate(filename = .x)) %>%
  glimpse()
## Rows: 500
## Columns: 12
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model        <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ        <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year         <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl          <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans        <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv          <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty          <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy          <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl           <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class        <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
## $ filename     <fs::path> "input/many_files/1.csv", "input/many_files/1.csv", …
# using directly read_csv
read_csv(csv_files_dir_ls, id = "filename", show_col_types = FALSE) %>% 
  glimpse
## Rows: 500
## Columns: 12
## $ filename     <chr> "input/many_files/1.csv", "input/many_files/1.csv", "inpu…
## $ manufacturer <chr> "volkswagen", "chevrolet", "volkswagen", "audi", "ford", …
## $ model        <chr> "jetta", "malibu", "jetta", "a4", "explorer 4wd", "camry …
## $ displ        <dbl> 2.0, 2.4, 2.0, 2.0, 4.0, 2.4, 5.7, 2.2, 2.8, 2.8, 1.8, 4.…
## $ year         <dbl> 1999, 2008, 2008, 2008, 1999, 2008, 2008, 1999, 1999, 199…
## $ cyl          <dbl> 4, 4, 4, 4, 6, 4, 8, 4, 6, 6, 4, 8, 6, 8, 6, 6, 8, 6, 8, …
## $ trans        <chr> "auto(l4)", "auto(l4)", "auto(s6)", "auto(av)", "manual(m…
## $ drv          <chr> "f", "f", "f", "f", "4", "f", "4", "4", "4", "f", "f", "4…
## $ cty          <dbl> 19, 22, 22, 21, 15, 22, 13, 21, 17, 18, 26, 13, 16, 13, 1…
## $ hwy          <dbl> 26, 30, 29, 30, 19, 31, 18, 26, 25, 26, 35, 19, 24, 17, 2…
## $ fl           <chr> "r", "r", "p", "p", "r", "r", "r", "r", "p", "p", "r", "r…
## $ class        <chr> "compact", "midsize", "compact", "compact", "suv", "compa…
# inconsistent column names
# generating the samples with inconsistent column names
mpg_samples2 <- map(1:10, ~ slice_sample(mpg, n = 20))
inconsistent_dframes <- map(mpg_samples2, ~ janitor::clean_names(dat = .x, case = "random"))
map(inconsistent_dframes, ~ colnames(.x)) %>% 
  head
## [[1]]
##  [1] "ManUfactuREr" "MoDel"        "dIsPl"        "yeAR"         "CYl"         
##  [6] "TRans"        "DRv"          "CTY"          "HwY"          "Fl"          
## [11] "clASs"       
## 
## [[2]]
##  [1] "MANUfaCturER" "mODel"        "dIsPL"        "year"         "cYl"         
##  [6] "trans"        "DRv"          "CtY"          "hWY"          "FL"          
## [11] "cLaSs"       
## 
## [[3]]
##  [1] "MAnuFAcTuReR" "mOdEL"        "DIsPL"        "YEAR"         "cyl"         
##  [6] "tRaNs"        "Drv"          "cty"          "hWY"          "fl"          
## [11] "clASS"       
## 
## [[4]]
##  [1] "mAnuFAcTUreR" "mOdel"        "DiSpL"        "yeAr"         "cyL"         
##  [6] "TRanS"        "dRV"          "ctY"          "hWY"          "fl"          
## [11] "clAsS"       
## 
## [[5]]
##  [1] "MANUFaCTUReR" "ModEl"        "disPL"        "yEaR"         "Cyl"         
##  [6] "TrAns"        "DrV"          "Cty"          "HwY"          "fl"          
## [11] "cLASS"       
## 
## [[6]]
##  [1] "MAnuFaCTurEr" "mOdel"        "DiSPL"        "yEar"         "cYL"         
##  [6] "TRAns"        "DrV"          "cTY"          "HWy"          "fL"          
## [11] "ClasS"
# selecting a random set of columns per data frame
inconsistent_dframes <- map(inconsistent_dframes, ~ .x[sample(1:length(.x), sample(1:length(.x), 1))])
map(inconsistent_dframes, ~ colnames(.x)) %>%
  head()
## [[1]]
##  [1] "MoDel"        "Fl"           "CTY"          "DRv"          "CYl"         
##  [6] "clASs"        "ManUfactuREr" "TRans"        "yeAR"         "HwY"         
## 
## [[2]]
## [1] "CtY"          "MANUfaCturER" "year"         "cYl"          "mODel"       
## [6] "cLaSs"       
## 
## [[3]]
##  [1] "cyl"          "MAnuFAcTuReR" "tRaNs"        "Drv"          "clASS"       
##  [6] "cty"          "mOdEL"        "hWY"          "YEAR"         "fl"          
## [11] "DIsPL"       
## 
## [[4]]
## [1] "clAsS" "TRanS"
## 
## [[5]]
## [1] "disPL"
## 
## [[6]]
##  [1] "mOdel"        "cTY"          "cYL"          "HWy"          "ClasS"       
##  [6] "MAnuFaCTurEr" "TRAns"        "fL"           "DrV"          "yEar"
# saving to disk
# dir_create(c("input/unclean_files"))
# iwalk(inconsistent_dframes, ~ write_csv(.x, paste0("input/unclean_files/", .y, ".csv")))

# loading and cleaning the data frames
many_columns_data_frame <- dir_ls(path = "input/unclean_files/", glob = "*.csv", type = "file") %>%
  map_dfr(~ read_csv(.x, name_repair = tolower, show_col_types = FALSE) %>% 
            mutate(filename = .x))

# showing results
many_columns_data_frame %>% 
  glimpse()
## Rows: 200
## Columns: 12
## $ trans        <chr> "manual(m5)", "manual(m5)", "manual(m6)", "auto(l5)", "ma…
## $ model        <chr> "impreza awd", "tiburon", "a4 quattro", "a4", "ram 1500 p…
## $ year         <dbl> 1999, 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 200…
## $ fl           <chr> "r", "r", "p", "p", "e", "r", "r", "r", "r", "r", "r", "p…
## $ displ        <dbl> 2.2, 2.0, 2.0, 2.8, 4.7, 3.8, 2.5, 3.8, 1.6, 4.2, 2.2, 1.…
## $ class        <chr> "subcompact", "subcompact", "compact", "compact", "pickup…
## $ cyl          <dbl> 4, 4, 4, 6, 8, 6, 5, 6, 4, 8, 4, 4, 8, 4, 6, 8, 5, 8, 4, …
## $ filename     <fs::path> "input/unclean_files/1.csv", "input/unclean_files/1.…
## $ drv          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hwy          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ cty          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ manufacturer <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# files not in the same folder
mpg_samples3 <- map(1:40, ~ slice_sample(mpg, n = 20)) 

# Create directories
# dir_create(c("input/nested_folders", "input/nested_folders/first_nested_folder", "input/nested_folders/second_nested_folder"))

# First folder 
# iwalk(mpg_samples[1:20], ~ write_csv(.x, paste0("input/nested_folders/first_nested_folder/", .y, "_first.csv")))

# Second folder 
# iwalk(mpg_samples[21:40], ~ write_csv(.x, paste0("input/nested_folders/second_nested_folder/", .y, "_second.csv")))

# searching through nested folders recursively
(csv_files_nested <- dir_ls("input/nested_folders/", glob = "*.csv", type = "file", recurse = TRUE))
## input/nested_folders/first_nested_folder/10_first.csv
## input/nested_folders/first_nested_folder/11_first.csv
## input/nested_folders/first_nested_folder/12_first.csv
## input/nested_folders/first_nested_folder/13_first.csv
## input/nested_folders/first_nested_folder/14_first.csv
## input/nested_folders/first_nested_folder/15_first.csv
## input/nested_folders/first_nested_folder/16_first.csv
## input/nested_folders/first_nested_folder/17_first.csv
## input/nested_folders/first_nested_folder/18_first.csv
## input/nested_folders/first_nested_folder/19_first.csv
## input/nested_folders/first_nested_folder/1_first.csv
## input/nested_folders/first_nested_folder/20_first.csv
## input/nested_folders/first_nested_folder/2_first.csv
## input/nested_folders/first_nested_folder/3_first.csv
## input/nested_folders/first_nested_folder/4_first.csv
## input/nested_folders/first_nested_folder/5_first.csv
## input/nested_folders/first_nested_folder/6_first.csv
## input/nested_folders/first_nested_folder/7_first.csv
## input/nested_folders/first_nested_folder/8_first.csv
## input/nested_folders/first_nested_folder/9_first.csv
## input/nested_folders/second_nested_folder/10_second.csv
## input/nested_folders/second_nested_folder/11_second.csv
## input/nested_folders/second_nested_folder/12_second.csv
## input/nested_folders/second_nested_folder/13_second.csv
## input/nested_folders/second_nested_folder/14_second.csv
## input/nested_folders/second_nested_folder/15_second.csv
## input/nested_folders/second_nested_folder/16_second.csv
## input/nested_folders/second_nested_folder/17_second.csv
## input/nested_folders/second_nested_folder/18_second.csv
## input/nested_folders/second_nested_folder/19_second.csv
## input/nested_folders/second_nested_folder/1_second.csv
## input/nested_folders/second_nested_folder/20_second.csv
## input/nested_folders/second_nested_folder/2_second.csv
## input/nested_folders/second_nested_folder/3_second.csv
## input/nested_folders/second_nested_folder/4_second.csv
## input/nested_folders/second_nested_folder/5_second.csv
## input/nested_folders/second_nested_folder/6_second.csv
## input/nested_folders/second_nested_folder/7_second.csv
## input/nested_folders/second_nested_folder/8_second.csv
## input/nested_folders/second_nested_folder/9_second.csv
map_dfr(csv_files_nested, ~ read_csv(.x, show_col_types = FALSE) %>% 
          mutate(filename = .x)) %>%
  glimpse()
## Rows: 800
## Columns: 12
## $ manufacturer <chr> "toyota", "jeep", "honda", "volkswagen", "toyota", "subar…
## $ model        <chr> "camry", "grand cherokee 4wd", "civic", "new beetle", "co…
## $ displ        <dbl> 2.4, 4.7, 1.6, 2.5, 1.8, 2.5, 2.0, 5.7, 4.0, 5.9, 4.0, 2.…
## $ year         <dbl> 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 1999, 199…
## $ cyl          <dbl> 4, 8, 4, 5, 4, 4, 4, 8, 6, 8, 6, 4, 8, 8, 8, 4, 8, 6, 4, …
## $ trans        <chr> "manual(m5)", "auto(l5)", "manual(m5)", "manual(m5)", "ma…
## $ drv          <chr> "f", "4", "f", "f", "f", "4", "f", "r", "4", "4", "f", "4…
## $ cty          <dbl> 21, 9, 23, 20, 26, 20, 19, 13, 14, 11, 16, 20, 11, 12, 9,…
## $ hwy          <dbl> 31, 12, 29, 28, 35, 27, 26, 17, 17, 15, 23, 27, 15, 16, 1…
## $ fl           <chr> "r", "e", "p", "r", "r", "r", "r", "r", "r", "r", "r", "r…
## $ class        <chr> "midsize", "suv", "subcompact", "subcompact", "compact", …
## $ filename     <fs::path> "input/nested_folders/first_nested_folder/10_first.c…
# selecting the files to import from a string pattern
csv_files_nested[str_detect(csv_files_nested, pattern = "[2-4]_first|second\\.csv$", negate = TRUE)] %>% 
  map_dfr(~ read_csv(.x, show_col_types = FALSE) %>% 
            mutate(filename = .x)) %>% 
  glimpse()
## Rows: 280
## Columns: 12
## $ manufacturer <chr> "toyota", "jeep", "honda", "volkswagen", "toyota", "subar…
## $ model        <chr> "camry", "grand cherokee 4wd", "civic", "new beetle", "co…
## $ displ        <dbl> 2.4, 4.7, 1.6, 2.5, 1.8, 2.5, 2.0, 5.7, 4.0, 5.9, 4.0, 2.…
## $ year         <dbl> 2008, 2008, 1999, 2008, 1999, 2008, 1999, 1999, 1999, 199…
## $ cyl          <dbl> 4, 8, 4, 5, 4, 4, 4, 8, 6, 8, 6, 4, 8, 8, 8, 4, 8, 6, 4, …
## $ trans        <chr> "manual(m5)", "auto(l5)", "manual(m5)", "manual(m5)", "ma…
## $ drv          <chr> "f", "4", "f", "f", "f", "4", "f", "r", "4", "4", "f", "4…
## $ cty          <dbl> 21, 9, 23, 20, 26, 20, 19, 13, 14, 11, 16, 20, 11, 12, 9,…
## $ hwy          <dbl> 31, 12, 29, 28, 35, 27, 26, 17, 17, 15, 23, 27, 15, 16, 1…
## $ fl           <chr> "r", "e", "p", "r", "r", "r", "r", "r", "r", "r", "r", "r…
## $ class        <chr> "midsize", "suv", "subcompact", "subcompact", "compact", …
## $ filename     <fs::path> "input/nested_folders/first_nested_folder/10_first.c…

3.2.2 Writing data

The write_csv function writes tabular data to an ASCII file in CSV format. Each row of data creates one line in the file, with data items separated by commas (,).

# write_csv(horas_sol, "output/horas_sol.csv")

3.3 Packages

3.3.1 The magic of rio

“The aim of rio is to make data file I/O [import/output] in R as easy as possible by implementing three simple functions in Swiss-army knife style,” according to the project’s GitHub page. Those functions are import(), export(), and convert(). So, the rio package has just one function to read in many different types of files: import(). Once you’ve analyzed your data, if you want to save the results as a CSV, Excel spreadsheet, or other format, rio’s export() function can handle that. You can use R’s download.file function with the syntax download.file("url", "destinationFileName.csv") to download files directly from the web. It’s possible rio will ask you to re-download the file in binary format, in which case you’ll need to run download.file(“http://bit.ly/BostonSnowfallCSV”, “BostonWinterSnowfalls.csv”, mode=‘wb").

# getting data from the web with R-built-in
download.file("http://bit.ly/BostonSnowfallCSV", "input/BostonWinterSnowfalls.csv")
# download.file(“http://bit.ly/BostonSnowfallCSV”, “BostonWinterSnowfalls.csv”, mode=‘wb")

# import data with rio locally
snowdata2 <- rio::import("input/BostonWinterSnowfalls.csv")
suicides <- rio::import("input/PDT-suicidesData.csv")
# rio::import("mySpreadsheet.xlsx", which = 2, col_names = c("City", "State", "Population"))

3.3.1.1 Import a file from the Web

If you want to download and import a file from the Web, you can do so if it’s publicly available and in a format such as Excel or CSV you can use rio. A lot of systems will be able to follow the redirect URL to the file even after first giving you an error message, as long as you specify the format as “csv” since the file name here doesn’t include “.csv”. rio can also import well-formatted HTML tables from Web pages, but the tables have to be extremely well-formatted. In real life, though, Web data rarely appears in such neat, isolated form. A good option for cases that aren’t quite as well crafted is often the htmltab package. Since it wasn’t specified which table, it pulled the first HTML table on the page. To download a specific table use the which argument.

The most popular way to install packages from GitHub is to use a package called devtools. devtools is an extremely powerful package designed mostly for people who want to write their own packages, and it includes a few ways to install packages from other places besides CRAN. However, devtools usually requires a couple of extra steps to install compared to a typical package. However, the pacman package will also install packages from non-CRAN sources like GitHub. You can use the number_with_commas() function to change those character strings that should be numbers back into numbers. The rmiscutils package isn’t the only way to deal with imported numbers that have commas, the tidyverse readr package also includes a function that turns character strings into numbers, parse_number(). One advantage of readr::parse_number() is that you can define your own locale() to control things like encoding and decimal marks. There’s an R package called janitor that can automatically fix troublesome column names imported from a non-R-friendly data source. You can create new clean column names using janitor’s clean_names() function.

# getting data from the web with rio
snowdata3 <- rio::import("http://bit.ly/BostonSnowfallCSV", format = "csv")

# getting html tables
design.tokens1 <- rio::import("https://designsystem.digital.gov/design-tokens/", format = "html")
citytable <- htmltab("https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population", which = 5)
design.tokens2 <- htmltab("https://designsystem.digital.gov/design-tokens/", which = 6)

# installing packages from GitHub with pacman
pacman::p_load_gh("smach/rmiscutils")

# changing those character strings that should be numbers back into numbers
citytable$PopEst2021 <- number_with_commas(citytable$`2021estimate`)
citytable$Census2020 <- readr::parse_number(citytable$`2020census`)

# cleaning cols names
citytable_cleaned <- janitor::clean_names(citytable)
names(citytable_cleaned)
##  [1] "x2021rank"                  "city"                      
##  [3] "state"                      "x2021estimate"             
##  [5] "x2020census"                "change"                    
##  [7] "x2020_land_area"            "x2020_land_area_2"         
##  [9] "x2020_population_density"   "x2020_population_density_2"
## [11] "location"                   "pop_est2021"               
## [13] "census2020"

3.3.1.2 Import data from packages

If you are interested in state or local government data in the US or Canada, you may want to check out RSocrata to see if an agency you’re interested in posts data there. I’ve yet to find a complete list of all available Socrata data sets, but there’s a search page at https://www.opendatanetwork.com.

3.3.2 What’s a data frame? And what can you do with one?

It’s easy to add a column to a data frame. The name of the new column is on the left, and there’s a formula on the right. Some of these special dataframe functions (technically called “methods”) not only give you information, but let you change characteristics of the data frame. So, names(snowdata) tells you the column names in the data frame but it will change the column names in the data frame.

# adding cols
snowdata$Meters <- snowdata$Total * 0.0254

# changing col names
names(snowdata) <- c("Winter", "SnowInches", "SnowMeters")

# changing from num to chr
download.file("https://raw.githubusercontent.com/smach/R4JournalismBook/master/data/bostonzips.txt", "input/bostonzips.txt")
zips <- rio::import("input/bostonzips.txt", colClasses = c("character", "character"))
# or
# zips <- rio::import("input/bostonzips.txt", colClasses = rep("character", 2))
# rep("character", 2) is the same as c(“character”, “character”), so colClasses = rep("character", 2) is equivalent to colClasses = c("character", "character")

3.3.3 Exporting data

Often after you’ve wrangled your data in R, you’ll want to save your results. Here are some of the ways to export your data:

  • Save to a CSV file with rio::export(myObjectName, file="myFileName.csv") and to an Excel file with rio::export(myObjectName, file="myFileName.xlsx"). rio understands what file format you want based on the extension of the file name. There are several other available formats, including .tsv for tab-separated data, .json for JSON and .xml for XML.

  • Save to an R binary object that makes it easy to load back into R in future sessions. There are two options.

Generic save() will save one or more objects into a file, such as save(objectName1, objectName2, file="myfilename.RData"). To read this data back into R, you just use the command load("myfilename.RData") and all the objects return with the same names in the same state they had before.

You can also save a single object into a file with saveRDS(myobject, file="filename.rds"). The logical assumption would be that loadRDS would read the file back in, but instead the command is readRDS – and in this case, just the data has been stored, not the object name. So, you need to read the data into a new object name, such as mydata <- readRDS("filename.rds").

You can also export an R object into your Windows or Mac clipboard with rio: rio::export(myObjectName, format = "clipboard"). And, you can import data into R from your clipboard the same way: rio::import(file = "clipboard").

rio’s convert() function lets you convert one file type to another without having to manually pull the data into and then out of R.

The openxlsx package makes writing to Excel files relatively easy. While there are lots of options in openxlsx, a typical pattern is to specify an Excel filename and a sheet name.

# write.xlsx(horas_sol, sheetName = "horas_sol", file = "output/horas_sol.xlsx")

3.3.4 Additional resources

While rio is a great Swiss Army knife of file handling, there may be times when you want a bit more control over how your data is pulled into or saved out of R. In addition, there have been times when I’ve had a challenging data file that rio choked on but another package could handle. Some other functions and packages you may want to explore:

  • Base R’s read.csv() and read.table() to import text files (use ?read.csv and ?read.table to get more information). stringsAsFactors = FALSE is needed with these if you want to keep your character strings as character strings. write.csv() will save to CSV.

  • Wickham’s readr package is also worth a look as part of the “tidyverse.” readr includes functions to read CSV, tab-separated, fixed-width, Web logs, and several other types of files. readr prints out the type of data it has determined for each column – integer, character, double (non-whole numbers), etc. It creates tibbles.

  • The googlesheets package lets you import data from a Google Sheet, even if it’s private, by authenticating your Google account. The package is available on CRAN; install it with with install.packages("googlesheets").

If you are working with large data sets, speed may become important to you when saving and loading files. The data.table package has a speedy fread() function, but beware that resulting objects are data.tables and not plain data frames; some behaviors are different. If you want a conventional data frame, you can get one with the as.data.frame(mydatatable) syntax. fwrite() function is aimed at writing to a CSV file considerably faster than base R’s write.csv().

The feather package saves in a binary format that can be read either into R or Python. And, the fst package’s read.fst() and write.fst() offer fast saving and loading of R data frame objects – plus the option of file compression.

4 Reshaping data

4.1 Base-R

Often the values required for a particular operation can be found in a data frame, but they are not organized in the appropriate way. As a simple example, data for multiple groups are often stored in spreadsheets or data summaries as columns, with a separate column for each group. Most of the modeling and graphics functions in R will not be able to work with such data; they expect the values to be in a single column with an additional column that specifies the group from which the data arose. The stack function can reorganize datasets to have this property. If there were other variables in the data frame that did not need to be converted to this form, the select= argument to stack allows you to specify the variables that should be used, similar to the same argument to the subset function. The unstack function will reorganize stacked data back to the one column per group form. To use unstack, a formula must be provided to explain the roles of the variables to be unstacked.

For more complex reorganizations, the concept of “wide” versus “long” datasets is often helpful. When there are multiple occurrences of values for a single observation, a data frame is said to be long if each occurrence is a separate row in the data frame; if all of the occurrences of values for a given observation are in the same row, then the dataset is said to be wide. The reshape function converts datasets between these two forms. Perhaps the most common use of reshape involves repeated measures analyses, where the same variable is recorded for each observation at several different times.

To use reshape to convert the dataset to wide format, we need to provide five arguments. The first argument is the data frame to be reshaped. The next three arguments provide the names of the columns that will be involved in the reshaping. The idvar= argument provides the names of the variables that define the experimental unit which was repeatedly measured. In this case, it”s the subj variable. The v.names= argument tells reshape which variables in the long format will be used to create the multiple variables in the wide format. In this example, we want both x and y be to be expanded to multiple variables, so we”d specify a vector with both those names. The timevar= variable tells which variable identifies the sequence number that will be used to create the multiple versions of the v.names variables; in this case it will be time. Finally, the direction= argument accepts values of “wide” or “long”, depending on which transformation is to be performed.

The names x.1, y.1, etc. were formed by joining together the variable names of the variables specified in the v.names= argument with the values of the timevar= variable. Any variables not specified in the v.names= argument are assumed to be constant for all observations with the same values as the idvar= variables, and a single copy of such variables will be included in the output data frame. Only the variables whose names appear in the v.names= argument will be converted into multiple variables, so if any variables that are in the data frame but not in the v.names= argument are not constant, reshape will print a warning message, and use the first value of such variables when converting to wide format. To prevent variables from being transferred to the output data frame, the drop= argument can be used to pass a vector of variable names to be ignored in the conversion.

The information about the reshaping procedure is stored as attributes in converted data frames, so once a data frame has been converted with reshape, it can be changed to its previous format by passing just the data frame with no additional arguments to reshape.

Since reshape can handle multiple sets of variables, the varying= argument should be passed a list containing vectors with the names of the different sets of variables that should be mapped to a single variable in the long dataset. The automatically generated variable id is simply a numeric index corresponding to the type variable; using idvar="type" will suppress its creation. The automatically generated variable time defaults to a set of consecutive integers; providing more meaningful values through the times= argument will label the values properly. Finally, the name of the column representing the values (which defaults to the first name in the varying= argument) can be set to a more meaningful name with the v.names= argument.

The reshape package uses the concept of “melting” a dataset (through the melt function) into a data frame which contains separate columns for each id variable, a variable column containing the name of each measured variable, and a final column named value with the variable”s value. It may be noticed that this melting operation is essentially a “wide-to-long” reshaping of the data.

For long-to-wide conversions, recall that variables appearing to the left of the tilde in the formula passed to cast will appear in the columns of the output, while those on the right will appear in the rows.

At the most basic level, two or more data frames can be combined by rows using rbind, or by columns using cbind. For rbind, the data frames must have the same number of columns; for cbind, the data frames must have the same number of rows. Vectors or matrices passed to cbind will be converted to data frames, so the mode of columns passed to cbind will be preserved. While cbind will demand that data frames and matrices are conformable (that is, they have the same number of rows), vectors passed to cbind will be recycled if the number of rows in the data frame or matrix is an even multiple of the length of the vector. It may be a good idea to use unique names when combining data frames in this way. An easy way to test is to pass the names of the two data frames to the intersect function. When using rbind, the names and classes of values to be joined must match, or a variety of errors may occur.

Although the rbind function will demand that the names of the objects being combined agree, cbind does not do any such checking. To combine data frames based on the values of common variables, the merge function should be used. This function is designed to provide the same sort of functionality and behavior as the table joins provided by relational databases. Although merge is limited to operating on two data frames at a time, it can be called repeatedly to deal with more than two data frames. The default behavior of merge is to join together rows of the data frames based on the values of all of the variables (columns) that the data frames have in common. (In database terminology, this is known as a natural join.) When called without any other arguments, merge returns only those rows which had observations in both data frames.

Although there were six unique values for a between the two data frames, only those rows with values of a in both data frames are represented in the output. To modify this, the all=, all.x=, and all.y= arguments can be used. Specifying all=TRUE will include all rows (full outer join, in database terminology), all.x=TRUE will include all rows from the first data frame (left outer join), and all.y=TRUE does the same for the second data frame (right outer join).

To take more control over which variables are used to merge rows of the data frame, the by= argument can be used. You provide the by= argument with a vector of the name or names of the variables that should be used for the merge. If the merging variables have different names in the data frames to be merged, the by.x= and by.y= arguments can be used.

mydata <- data.frame(grp1 = c(12, 15, 19, 22, 25), grp2 = c(18, 12, 42, 29, 44), grp3 = c(8, 17, 22, 19, 31))

# reshaping data frame
sdata <- stack(mydata)
sdata
##    values  ind
## 1      12 grp1
## 2      15 grp1
## 3      19 grp1
## 4      22 grp1
## 5      25 grp1
## 6      18 grp2
## 7      12 grp2
## 8      42 grp2
## 9      29 grp2
## 10     44 grp2
## 11      8 grp3
## 12     17 grp3
## 13     22 grp3
## 14     19 grp3
## 15     31 grp3
# converting the the original form
mydata <- unstack(sdata, values ~ ind)
mydata
##   grp1 grp2 grp3
## 1   12   18    8
## 2   15   12   17
## 3   19   42   22
## 4   22   29   19
## 5   25   44   31
# using reshape
set.seed(17)
obs <- data.frame(subj = rep(1:4, rep(3, 4)), time = rep(1:3), x = rnorm(12), y = rnorm(12))
head(obs)
##   subj time           x           y
## 1    1    1 -1.01500872  1.29532187
## 2    1    2 -0.07963674  0.18791807
## 3    1    3 -0.23298702  1.59120510
## 4    2    1 -0.81726793 -0.05517906
## 5    2    2  0.77209084  0.83847112
## 6    2    3 -0.16561194  0.15937013
# from long to wide
wideobs <- reshape(obs, idvar = "subj", v.names = c("x", "y"), timevar = "time", direction = "wide")
head(wideobs)
##    subj        x.1         y.1         x.2        y.2        x.3       y.3
## 1     1 -1.0150087  1.29532187 -0.07963674  0.1879181 -0.2329870 1.5912051
## 4     2 -0.8172679 -0.05517906  0.77209084  0.8384711 -0.1656119 0.1593701
## 7     3  0.9728744  0.62595440  1.71653398  0.6335847  0.2552370 0.6810276
## 10    4  0.3665811 -0.68203337  1.18078924 -0.7232567  0.6431921 1.6735260
# from wide to long
obs <- reshape(wideobs)
head(obs)
##     subj time           x           y
## 1.1    1    1 -1.01500872  1.29532187
## 2.1    2    1 -0.81726793 -0.05517906
## 3.1    3    1  0.97287443  0.62595440
## 4.1    4    1  0.36658112 -0.68203337
## 1.2    1    2 -0.07963674  0.18791807
## 2.2    2    2  0.77209084  0.83847112
# from wide to long (complex example)
usp <- data.frame(type = rownames(USPersonalExpenditure), USPersonalExpenditure, row.names = NULL)
head(usp)
##                  type  X1940  X1945 X1950 X1955 X1960
## 1    Food and Tobacco 22.200 44.500 59.60  73.2 86.80
## 2 Household Operation 10.500 15.500 29.00  36.5 46.20
## 3  Medical and Health  3.530  5.760  9.71  14.0 21.10
## 4       Personal Care  1.040  1.980  2.45   3.4  5.40
## 5   Private Education  0.341  0.974  1.80   2.6  3.64
rr <- reshape(usp, varying = list(names(usp)[-1]), idvar = "type", times = seq(1940, 1960, by = 5), v.names = "expend", direction = "long")
head(rr)
##                                         type time expend
## Food and Tobacco.1940       Food and Tobacco 1940 22.200
## Household Operation.1940 Household Operation 1940 10.500
## Medical and Health.1940   Medical and Health 1940  3.530
## Personal Care.1940             Personal Care 1940  1.040
## Private Education.1940     Private Education 1940  0.341
## Food and Tobacco.1945       Food and Tobacco 1945 44.500
# an alternative way of reshaping the usp data frame, without having to explicitly provide the values of the times
rr1 <- reshape(usp, varying = names(usp)[-1], idvar = "type", split = list(regexp = "X1", include = TRUE), direction = "long")
head(rr1)
##                                         type time      X
## Food and Tobacco.1940       Food and Tobacco 1940 22.200
## Household Operation.1940 Household Operation 1940 10.500
## Medical and Health.1940   Medical and Health 1940  3.530
## Personal Care.1940             Personal Care 1940  1.040
## Private Education.1940     Private Education 1940  0.341
## Food and Tobacco.1945       Food and Tobacco 1945 44.500
# using melt from reshape package (wide-to-long)
musp = reshape::melt(usp)
# or
# reshape::cast(musp,variable + type ~ .)
head(musp)
##                  type variable  value
## 1    Food and Tobacco    X1940 22.200
## 2 Household Operation    X1940 10.500
## 3  Medical and Health    X1940  3.530
## 4       Personal Care    X1940  1.040
## 5   Private Education    X1940  0.341
## 6    Food and Tobacco    X1945 44.500
# getting rid of "X" and changing type to numeric
musp$variable <- as.numeric(sub("X", "", musp$variable))
# renaming columns
names(musp)[2:3] <- c("time", "expend")
head(musp)
##                  type time expend
## 1    Food and Tobacco 1940 22.200
## 2 Household Operation 1940 10.500
## 3  Medical and Health 1940  3.530
## 4       Personal Care 1940  1.040
## 5   Private Education 1940  0.341
## 6    Food and Tobacco 1945 44.500
# using cast from reshape package (long-to-wide)
set.seed(999)
obs2 <- data.frame(subj = rep(1:4, rep(3, 4)), time = rep(1:3), x = rnorm(12), y = rnorm(12))
mobs <- reshape::melt(obs2)
# reshape::cast(subj ~ variable + time, data = mobs)

# combining data sets using cbind and rbind
x <- data.frame(a = c("A", "B", "C"), x = c(12, 15, 19))
y <- data.frame(a = c("D", "E", "F", "G"), x = c(19, 21, 14, 12))
intersect(names(x), names(y))
## [1] "a" "x"
cbind(y, z = c(1, 2))
##   a  x z
## 1 D 19 1
## 2 E 21 2
## 3 F 14 1
## 4 G 12 2
# combining data sets based on the values of common variables
x <- data.frame(a = c(1, 2, 4, 5, 6), x = c(9, 12, 14, 21, 8))
y <- data.frame(a = c(1, 3, 4, 6), y = c(8, 14, 19, 2))
merge(x, y)
##   a  x  y
## 1 1  9  8
## 2 4 14 19
## 3 6  8  2
# outer join
merge(x, y, all = TRUE)
##   a  x  y
## 1 1  9  8
## 2 2 12 NA
## 3 3 NA 14
## 4 4 14 19
## 5 5 21 NA
## 6 6  8  2
# left outer join
merge(x, y, all.x = TRUE)
##   a  x  y
## 1 1  9  8
## 2 2 12 NA
## 3 4 14 19
## 4 5 21 NA
## 5 6  8  2
# right outer join
merge(x, y, all.y = TRUE)
##   a  x  y
## 1 1  9  8
## 2 3 NA 14
## 3 4 14 19
## 4 6  8  2
cities <- data.frame(city = c("New York", "Boston", "Juneau", "Anchorage", "San Diego", "Philadelphia", "Los Angeles", "Fairbanks", "Ann Arbor", "Seattle"), state.abb = c("NY", "MA", "AK", "AK", "CA", "PA", "CA", "AK", "MI", "WA"))
states <- data.frame(state.abb = c("NY", "MA", "AK", "CA", "PA", "MI", "WA"), state = c("New York", "Massachusetts", "Alaska", "California", "Pennsylvania", "Michigan", "Washington"))

merge(cities, states)
##    state.abb         city         state
## 1         AK       Juneau        Alaska
## 2         AK    Anchorage        Alaska
## 3         AK    Fairbanks        Alaska
## 4         CA    San Diego    California
## 5         CA  Los Angeles    California
## 6         MA       Boston Massachusetts
## 7         MI    Ann Arbor      Michigan
## 8         NY     New York      New York
## 9         PA Philadelphia  Pennsylvania
## 10        WA      Seattle    Washington
# another example
# mayordata <- merge(contributions_split, results_split, all.x = TRUE, all.y = TRUE, by.x = "LastName", by.y = "LastName")

4.2 Tidyverse

There are three interrelated rules which make a dataset tidy: 1. Each variable must have its own column. 2. Each observation must have its own row. 3. Each value must have its own cell. These three rules are interrelated because it”s impossible to only satisfy two of the three: 1. Put each dataset in a tibble. 2. Put each variable in a column.

  • Wide form: every row corresponds to a unique subject.
  • Long form: every row corresponds to a unique measurement.

Tidy data set

pivot_longer() “lengthens” data, increasing the number of rows and decreasing the number of columns. The inverse transformation is pivot_wider(). pivot_longer() is an updated approach to gather(), designed to be both simpler to use and to handle more use cases. We recommend you use pivot_longer() for new code; gather() isn’t going away but is no longer under active development.

pivot_wider() “widens” data, increasing the number of columns and decreasing the number of rows. The inverse transformation is pivot_longer(). pivot_wider() is an updated approach to spread(), designed to be both simpler to use and to handle more use cases. We recommend you use pivot_wider() for new code; spread() isn’t going away but is no longer under active development.

# wide to long
# where column names are character data
head(relig_income)
## # A tibble: 6 × 11
##   religion       `<$10k` $10-2…¹ $20-3…² $30-4…³ $40-5…⁴ $50-7…⁵ $75-1…⁶ $100-…⁷
##   <chr>            <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Agnostic            27      34      60      81      76     137     122     109
## 2 Atheist             12      27      37      52      35      70      73      59
## 3 Buddhist            27      21      30      34      33      58      62      39
## 4 Catholic           418     617     732     670     638    1116     949     792
## 5 Don’t know/re…      15      14      15      11      10      35      21      17
## 6 Evangelical P…     575     869    1064     982     881    1486     949     723
## # … with 2 more variables: `>150k` <dbl>, `Don't know/refused` <dbl>, and
## #   abbreviated variable names ¹​`$10-20k`, ²​`$20-30k`, ³​`$30-40k`, ⁴​`$40-50k`,
## #   ⁵​`$50-75k`, ⁶​`$75-100k`, ⁷​`$100-150k`
relig_income %>%
  pivot_longer(!religion, names_to = "income", values_to = "count")
## # A tibble: 180 × 3
##    religion income             count
##    <chr>    <chr>              <dbl>
##  1 Agnostic <$10k                 27
##  2 Agnostic $10-20k               34
##  3 Agnostic $20-30k               60
##  4 Agnostic $30-40k               81
##  5 Agnostic $40-50k               76
##  6 Agnostic $50-75k              137
##  7 Agnostic $75-100k             122
##  8 Agnostic $100-150k            109
##  9 Agnostic >150k                 84
## 10 Agnostic Don't know/refused    96
## # … with 170 more rows
# columns have common prefix and missing missings are structural so should be dropped
head(billboard)
## # A tibble: 6 × 79
##   artist  track date.ent…¹   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8   wk9
##   <chr>   <chr> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac   Baby… 2000-02-26    87    82    72    77    87    94    99    NA    NA
## 2 2Ge+her The … 2000-09-02    91    87    92    NA    NA    NA    NA    NA    NA
## 3 3 Door… Kryp… 2000-04-08    81    70    68    67    66    57    54    53    51
## 4 3 Door… Loser 2000-10-21    76    76    72    69    67    65    55    59    62
## 5 504 Bo… Wobb… 2000-04-15    57    34    25    17    17    31    36    49    53
## 6 98^0    Give… 2000-08-19    51    39    34    26    26    19     2     2     3
## # … with 67 more variables: wk10 <dbl>, wk11 <dbl>, wk12 <dbl>, wk13 <dbl>,
## #   wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>, wk19 <dbl>,
## #   wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>, wk25 <dbl>,
## #   wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>, wk31 <dbl>,
## #   wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>, wk37 <dbl>,
## #   wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, wk43 <dbl>,
## #   wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, …
billboard %>%
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    names_prefix = "wk",
    values_to = "rank",
    values_drop_na = TRUE
  )
## # A tibble: 5,307 × 5
##    artist  track                   date.entered week   rank
##    <chr>   <chr>                   <date>       <chr> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   1        87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   2        82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   3        72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   4        77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   5        87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   6        94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   7        99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02   1        91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02   2        87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02   3        92
## # … with 5,297 more rows
# multiple variables stored in column names
head(tidyr::who)
## # A tibble: 6 × 60
##   country     iso2  iso3   year new_sp…¹ new_s…² new_s…³ new_s…⁴ new_s…⁵ new_s…⁶
##   <chr>       <chr> <chr> <int>    <int>   <int>   <int>   <int>   <int>   <int>
## 1 Afghanistan AF    AFG    1980       NA      NA      NA      NA      NA      NA
## 2 Afghanistan AF    AFG    1981       NA      NA      NA      NA      NA      NA
## 3 Afghanistan AF    AFG    1982       NA      NA      NA      NA      NA      NA
## 4 Afghanistan AF    AFG    1983       NA      NA      NA      NA      NA      NA
## 5 Afghanistan AF    AFG    1984       NA      NA      NA      NA      NA      NA
## 6 Afghanistan AF    AFG    1985       NA      NA      NA      NA      NA      NA
## # … with 50 more variables: new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, …
tidyr::who %>%
  pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count"
)
## # A tibble: 405,440 × 8
##    country     iso2  iso3   year diagnosis gender age   count
##    <chr>       <chr> <chr> <int> <chr>     <chr>  <chr> <int>
##  1 Afghanistan AF    AFG    1980 sp        m      014      NA
##  2 Afghanistan AF    AFG    1980 sp        m      1524     NA
##  3 Afghanistan AF    AFG    1980 sp        m      2534     NA
##  4 Afghanistan AF    AFG    1980 sp        m      3544     NA
##  5 Afghanistan AF    AFG    1980 sp        m      4554     NA
##  6 Afghanistan AF    AFG    1980 sp        m      5564     NA
##  7 Afghanistan AF    AFG    1980 sp        m      65       NA
##  8 Afghanistan AF    AFG    1980 sp        f      014      NA
##  9 Afghanistan AF    AFG    1980 sp        f      1524     NA
## 10 Afghanistan AF    AFG    1980 sp        f      2534     NA
## # … with 405,430 more rows
# multiple observations per row
head(anscombe)
##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04
anscombe %>% 
  pivot_longer(everything(),
 names_to = c(".value", "set"),
 names_pattern = "(.)(.)"
)
## # A tibble: 44 × 3
##    set       x     y
##    <chr> <dbl> <dbl>
##  1 1        10  8.04
##  2 2        10  9.14
##  3 3        10  7.46
##  4 4         8  6.58
##  5 1         8  6.95
##  6 2         8  8.14
##  7 3         8  6.77
##  8 4         8  5.76
##  9 1        13  7.58
## 10 2        13  8.74
## # … with 34 more rows
# long to wide
head(fish_encounters)
## # A tibble: 6 × 3
##   fish  station  seen
##   <fct> <fct>   <int>
## 1 4842  Release     1
## 2 4842  I80_1       1
## 3 4842  Lisbon      1
## 4 4842  Rstr        1
## 5 4842  Base_TD     1
## 6 4842  BCE         1
fish_encounters %>% 
  pivot_wider(names_from = station, values_from = seen)
## # A tibble: 19 × 12
##    fish  Release I80_1 Lisbon  Rstr Base_TD   BCE   BCW  BCE2  BCW2   MAE   MAW
##    <fct>   <int> <int>  <int> <int>   <int> <int> <int> <int> <int> <int> <int>
##  1 4842        1     1      1     1       1     1     1     1     1     1     1
##  2 4843        1     1      1     1       1     1     1     1     1     1     1
##  3 4844        1     1      1     1       1     1     1     1     1     1     1
##  4 4845        1     1      1     1       1    NA    NA    NA    NA    NA    NA
##  5 4847        1     1      1    NA      NA    NA    NA    NA    NA    NA    NA
##  6 4848        1     1      1     1      NA    NA    NA    NA    NA    NA    NA
##  7 4849        1     1     NA    NA      NA    NA    NA    NA    NA    NA    NA
##  8 4850        1     1     NA     1       1     1     1    NA    NA    NA    NA
##  9 4851        1     1     NA    NA      NA    NA    NA    NA    NA    NA    NA
## 10 4854        1     1     NA    NA      NA    NA    NA    NA    NA    NA    NA
## 11 4855        1     1      1     1       1    NA    NA    NA    NA    NA    NA
## 12 4857        1     1      1     1       1     1     1     1     1    NA    NA
## 13 4858        1     1      1     1       1     1     1     1     1     1     1
## 14 4859        1     1      1     1       1    NA    NA    NA    NA    NA    NA
## 15 4861        1     1      1     1       1     1     1     1     1     1     1
## 16 4862        1     1      1     1       1     1     1     1     1    NA    NA
## 17 4863        1     1     NA    NA      NA    NA    NA    NA    NA    NA    NA
## 18 4864        1     1     NA    NA      NA    NA    NA    NA    NA    NA    NA
## 19 4865        1     1      1    NA      NA    NA    NA    NA    NA    NA    NA
# filling in missing values
fish_encounters %>%
  pivot_wider(names_from = station, values_from = seen, values_fill = 0)
## # A tibble: 19 × 12
##    fish  Release I80_1 Lisbon  Rstr Base_TD   BCE   BCW  BCE2  BCW2   MAE   MAW
##    <fct>   <int> <int>  <int> <int>   <int> <int> <int> <int> <int> <int> <int>
##  1 4842        1     1      1     1       1     1     1     1     1     1     1
##  2 4843        1     1      1     1       1     1     1     1     1     1     1
##  3 4844        1     1      1     1       1     1     1     1     1     1     1
##  4 4845        1     1      1     1       1     0     0     0     0     0     0
##  5 4847        1     1      1     0       0     0     0     0     0     0     0
##  6 4848        1     1      1     1       0     0     0     0     0     0     0
##  7 4849        1     1      0     0       0     0     0     0     0     0     0
##  8 4850        1     1      0     1       1     1     1     0     0     0     0
##  9 4851        1     1      0     0       0     0     0     0     0     0     0
## 10 4854        1     1      0     0       0     0     0     0     0     0     0
## 11 4855        1     1      1     1       1     0     0     0     0     0     0
## 12 4857        1     1      1     1       1     1     1     1     1     0     0
## 13 4858        1     1      1     1       1     1     1     1     1     1     1
## 14 4859        1     1      1     1       1     0     0     0     0     0     0
## 15 4861        1     1      1     1       1     1     1     1     1     1     1
## 16 4862        1     1      1     1       1     1     1     1     1     0     0
## 17 4863        1     1      0     0       0     0     0     0     0     0     0
## 18 4864        1     1      0     0       0     0     0     0     0     0     0
## 19 4865        1     1      1     0       0     0     0     0     0     0     0
# generating column names from multiple variables
head(us_rent_income)
## # A tibble: 6 × 5
##   GEOID NAME    variable estimate   moe
##   <chr> <chr>   <chr>       <dbl> <dbl>
## 1 01    Alabama income      24476   136
## 2 01    Alabama rent          747     3
## 3 02    Alaska  income      32940   508
## 4 02    Alaska  rent         1200    13
## 5 04    Arizona income      27517   148
## 6 04    Arizona rent          972     4
us_rent_income %>% 
  pivot_wider(names_from = variable, values_from = c(estimate, moe))
## # A tibble: 52 × 6
##    GEOID NAME                 estimate_income estimate_rent moe_income moe_rent
##    <chr> <chr>                          <dbl>         <dbl>      <dbl>    <dbl>
##  1 01    Alabama                        24476           747        136        3
##  2 02    Alaska                         32940          1200        508       13
##  3 04    Arizona                        27517           972        148        4
##  4 05    Arkansas                       23789           709        165        5
##  5 06    California                     29454          1358        109        3
##  6 08    Colorado                       32401          1125        109        5
##  7 09    Connecticut                    35326          1123        195        5
##  8 10    Delaware                       31560          1076        247       10
##  9 11    District of Columbia           43198          1424        681       17
## 10 12    Florida                        25952          1077         70        3
## # … with 42 more rows
# when there are multiple `names_from` or `values_from`, you can use use `names_sep` or `names_glue` to control the output variable names
us_rent_income %>%
  pivot_wider(
    names_from = variable,
    names_sep = ".",
    values_from = c(estimate, moe)
  )
## # A tibble: 52 × 6
##    GEOID NAME                 estimate.income estimate.rent moe.income moe.rent
##    <chr> <chr>                          <dbl>         <dbl>      <dbl>    <dbl>
##  1 01    Alabama                        24476           747        136        3
##  2 02    Alaska                         32940          1200        508       13
##  3 04    Arizona                        27517           972        148        4
##  4 05    Arkansas                       23789           709        165        5
##  5 06    California                     29454          1358        109        3
##  6 08    Colorado                       32401          1125        109        5
##  7 09    Connecticut                    35326          1123        195        5
##  8 10    Delaware                       31560          1076        247       10
##  9 11    District of Columbia           43198          1424        681       17
## 10 12    Florida                        25952          1077         70        3
## # … with 42 more rows
us_rent_income %>%
 pivot_wider(
    names_from = variable,
    names_glue = "{variable}_{.value}",
    values_from = c(estimate, moe)
  )
## # A tibble: 52 × 6
##    GEOID NAME                 income_estimate rent_estimate income_moe rent_moe
##    <chr> <chr>                          <dbl>         <dbl>      <dbl>    <dbl>
##  1 01    Alabama                        24476           747        136        3
##  2 02    Alaska                         32940          1200        508       13
##  3 04    Arizona                        27517           972        148        4
##  4 05    Arkansas                       23789           709        165        5
##  5 06    California                     29454          1358        109        3
##  6 08    Colorado                       32401          1125        109        5
##  7 09    Connecticut                    35326          1123        195        5
##  8 10    Delaware                       31560          1076        247       10
##  9 11    District of Columbia           43198          1424        681       17
## 10 12    Florida                        25952          1077         70        3
## # … with 42 more rows
warpbreaks <- as_tibble(warpbreaks[c("wool", "tension", "breaks")])
head(warpbreaks)
## # A tibble: 6 × 3
##   wool  tension breaks
##   <fct> <fct>    <dbl>
## 1 A     L           26
## 2 A     L           30
## 3 A     L           54
## 4 A     L           25
## 5 A     L           70
## 6 A     L           52
warpbreaks %>%
  pivot_wider(
    names_from = wool,
    values_from = breaks,
    values_fn = mean
  )
## # A tibble: 3 × 3
##   tension     A     B
##   <fct>   <dbl> <dbl>
## 1 L        44.6  28.2
## 2 M        24    28.8
## 3 H        24.6  18.8

Existen otras tres funciones que son muy útiles para la manipulación de datos: separate(), que sirve para asegurar una columna en varias nuevas; case_when(), que sirve para establecer condionales y es similar a ifelse() y complete() que sirve para completar una variable o una combinación de variables.

Para mostrar la primera función convertimos la fecha en tres columnas: año (yr), mes (mo) y día (dy), cuyos elementos están separados por “-“. Por ello, es necesario indicar la columna afectada, los nombres de las nuevas columnas y el símbolo separador. La función mutate_all() aplica a todas las columnas otra función, en este caso, as.numeric() para convertir todas en numéricas.

En lugar de encapsular y encadenar ifelse(), podemos usar la función case_when(), en la que empleamos fórmulas en dos tiempos: por un lado la condición; por otro, la acción cuando se cumpla esa condición.

Nuestro conjunto de datos contiene datos diarios desde el 1 de enero de 1980 hasta el 31 de diciembre de 2015. Así que únicamente debemos crear un vector con fechas de este periodo. En la función complete() indicamos la columna que queremos completar y le asignamos el vector entero de fechas. El resultado es un nuevo data.frame con todas las fechas, rellenando el resto de columnas con NA.

The extract function is basically the separate function with super powers and works with groups instead of separators. The separate function allows you to split a character variable into multiple variables. The key difference between separate and extract is that extract works with groups within its regular expressions. Each captured group is converted into a new column. So instead of thinking of the separator in separate with extract, we think of groups. extract takes a few arguments:

  • col specifies the character column to be split into several columns.
  • into specifies the name of the columns to be created
  • regex defines the regular expression in which we capture the groups that will represent the new columns
  • remove tells the function if the original column should be removed (by default TRUE)

To extract columns that are more complicated and confusing, we need to learn the concept of non-grouping parentheses. Non-grouping parentheses define groups that are not captured. In other words, these groups are not converted into a new column. A non-grouping parenthesis is defined by a group that starts with a question mark and a colon: (?:). The advantage of this method is that we can solve column separation problems caused by messy or inconsistent variables.

tidyr’s separate() function will split a data frame column into multiple columns based on a delimiter of your choice. The syntax is separate(my_df, my_col_name, my_new_col_names, my_delimiter, my_number_of_new_columns).

Regular Expressions

ta_vigo <- read_csv( "input/TG_STAID001395.txt", skip = 20)
head(ta_vigo)
## # A tibble: 6 × 5
##   STAID SOUID     DATE    TG  Q_TG
##   <dbl> <dbl>    <dbl> <dbl> <dbl>
## 1  1395 20408 19560501 -9999     9
## 2  1395 20408 19560502 -9999     9
## 3  1395 20408 19560503 -9999     9
## 4  1395 20408 19560504 -9999     9
## 5  1395 20408 19560505 -9999     9
## 6  1395 20408 19560506 -9999     9
ta_vigo <- mutate(ta_vigo, DATE = lubridate::ymd(DATE), TG = ifelse(TG == -9999, NA, TG / 10)) %>%
  filter(DATE >= "1980-01-01", DATE <= "2015-12-31") %>%
  select(-STAID:-SOUID, -Q_TG) %>%
  rename(date = DATE)
head(ta_vigo)
## # A tibble: 6 × 2
##   date          TG
##   <date>     <dbl>
## 1 1980-01-01  11.2
## 2 1980-01-02   9.5
## 3 1980-01-03   7.5
## 4 1980-01-04  10.3
## 5 1980-01-05   7.8
## 6 1980-01-06   7.3
horas_sol <- read_csv("input/SS_STAID001395.txt", skip = 19)
horas_sol <- mutate(horas_sol, DATE = lubridate::ymd(DATE), SS = ifelse(SS == -9999, NA, SS / 10), month = month(DATE), year = year(DATE))
horas_sol <- rename(horas_sol, date = DATE, sunhours = SS)
horas_sol
## # A tibble: 22,494 × 6
##     SOUID date       sunhours  Q_SS month  year
##     <dbl> <date>        <dbl> <dbl> <int> <int>
##  1 120414 1956-05-01       NA     9     5  1956
##  2 120414 1956-05-02       NA     9     5  1956
##  3 120414 1956-05-03       NA     9     5  1956
##  4 120414 1956-05-04       NA     9     5  1956
##  5 120414 1956-05-05       NA     9     5  1956
##  6 120414 1956-05-06       NA     9     5  1956
##  7 120414 1956-05-07       NA     9     5  1956
##  8 120414 1956-05-08       NA     9     5  1956
##  9 120414 1956-05-09       NA     9     5  1956
## 10 120414 1956-05-10       NA     9     5  1956
## # … with 22,484 more rows
data_vigo <- left_join (ta_vigo, horas_sol, by = "date" )
data_vigo
## # A tibble: 13,149 × 7
##    date          TG  SOUID sunhours  Q_SS month  year
##    <date>     <dbl>  <dbl>    <dbl> <dbl> <int> <int>
##  1 1980-01-01  11.2 120414      0.3     0     1  1980
##  2 1980-01-02   9.5 120414      0.2     0     1  1980
##  3 1980-01-03   7.5 120414      1.2     0     1  1980
##  4 1980-01-04  10.3 120414      0.5     0     1  1980
##  5 1980-01-05   7.8 120414      3       0     1  1980
##  6 1980-01-06   7.3 120414      1.8     0     1  1980
##  7 1980-01-07   9.3 120414      0       0     1  1980
##  8 1980-01-08   9.8 120414      3.3     0     1  1980
##  9 1980-01-09   8.3 120414      0       0     1  1980
## 10 1980-01-10   6.5 120414      4.1     0     1  1980
## # … with 13,139 more rows
rio::export(data_vigo, file = "input/data_vigo.csv")

# using separate() 
# data_vigo <- rio::import("input/data_vigo.csv")
time_df <- select(data_vigo, date) %>%
  separate(date, c("yr", "mo", "dy"), sep = "-") %>%
  mutate_all(as.numeric)
head(time_df)
## # A tibble: 6 × 3
##      yr    mo    dy
##   <dbl> <dbl> <dbl>
## 1  1980     1     1
## 2  1980     1     2
## 3  1980     1     3
## 4  1980     1     4
## 5  1980     1     5
## 6  1980     1     6
# another example
contributions_split <- tidyr::separate(
  contributions, Candidate,
  c("LastName", "FirstName"), ", ", 2
) %>%
  select(-FirstName)
head(contributions_split)
##     LastName Pct_Local_Contributors
##     Horrigan            0.035820896
##  Neves-Grigg            0.011940299
##          Sen            0.008955224
##        Sousa            0.029850746
##       Spicer            0.516417910
##    Stefanini            0.337313433
results_split <- tidyr::separate(results, Candidate, c("FirstName", "MiddleName", "LastName"), " ")
tail(results_split)
## # A tibble: 6 × 4
##   FirstName MiddleName LastName     Pct_Vote
##   <chr>     <chr>      <chr>           <dbl>
## 1 John      A.         Stefanini     0.292  
## 2 Dhruba    P.         Sen           0.00926
## 3 Mark      S.         Tilden        0.0402 
## 4 Yvonne    M.         Spicer        0.547  
## 5 Benjaman  A.         Neves-Grigg,  0.0123 
## 6 Priscila  Sousa      <NA>          0.0493
results_split %<>%
  mutate(
    LastName = ifelse(is.na(LastName), MiddleName, LastName),
    LastName = str_replace(LastName, ",", "")
  ) %>%
  select(-FirstName, -MiddleName)
tail(results_split)
## # A tibble: 6 × 2
##   LastName    Pct_Vote
##   <chr>          <dbl>
## 1 Stefanini    0.292  
## 2 Sen          0.00926
## 3 Tilden       0.0402 
## 4 Spicer       0.547  
## 5 Neves-Grigg  0.0123 
## 6 Sousa        0.0493
# using case_when()
time_df <- mutate(time_df, season = case_when(
  mo %in% c(12, 1:2) ~ "invierno",
  mo %in% 3:5 ~ "primavera",
  mo %in% 6:8 ~ "verano",
  mo %in% 9:11 ~ "otoño"
))

# preparing the data
data_vigo <- gather(data_vigo, Variable, Valor, TG:sunhours)
data_vigo_subset <- filter(data_vigo, Variable == "TG") %>%
  slice(sample(nrow(data_vigo), 10000)) %>%
  arrange(date)
head(data_vigo_subset)
## # A tibble: 6 × 6
##   date        Q_SS month  year Variable Valor
##   <date>     <dbl> <int> <int> <chr>    <dbl>
## 1 1980-01-07     0     1  1980 TG         9.3
## 2 1980-01-11     0     1  1980 TG         7.1
## 3 1980-01-13     0     1  1980 TG         3.4
## 4 1980-01-17     0     1  1980 TG         4.5
## 5 1980-01-23     0     1  1980 TG        10.4
## 6 1980-01-27     0     1  1980 TG        12.7
date_ts <- seq(lubridate::ymd("1980-01-01"), lubridate::ymd("2015-12-31"), "day")

# using complete()
data_vigo_subset <- complete(data_vigo_subset, date = date_ts)
head(data_vigo_subset)
## # A tibble: 6 × 6
##   date        Q_SS month  year Variable Valor
##   <date>     <dbl> <int> <int> <chr>    <dbl>
## 1 1980-01-01    NA    NA    NA <NA>        NA
## 2 1980-01-02    NA    NA    NA <NA>        NA
## 3 1980-01-03    NA    NA    NA <NA>        NA
## 4 1980-01-04    NA    NA    NA <NA>        NA
## 5 1980-01-05    NA    NA    NA <NA>        NA
## 6 1980-01-06    NA    NA    NA <NA>        NA
# using extract()
# example with separate()
tibble(
  variable = c("a-b", "a-d", "b-c", "d-e")
) %>%
  separate(
    variable,
    into = c("a", "b"),
    sep = "-",
    remove = FALSE
  )
## # A tibble: 4 × 3
##   variable a     b    
##   <chr>    <chr> <chr>
## 1 a-b      a     b    
## 2 a-d      a     d    
## 3 b-c      b     c    
## 4 d-e      d     e
# with extract
tibble(
  variable = c("a-b", "a-d", "b-c", "d-e")
) %>%
  extract(
    col = variable,
    into = c("a", "b"),
    regex = "([a-z])-([a-z])",
    remove = FALSE
  )
## # A tibble: 4 × 3
##   variable a     b    
##   <chr>    <chr> <chr>
## 1 a-b      a     b    
## 2 a-d      a     d    
## 3 b-c      b     c    
## 4 d-e      d     e
# without separator
tibble(
  variable = c("x1", "x2", "y1", "y2")
) %>%
  extract(
    variable,
    into = c("letter", "number"),
    regex = "([xy])(\\d)",
    remove = FALSE
  )
## # A tibble: 4 × 3
##   variable letter number
##   <chr>    <chr>  <chr> 
## 1 x1       x      1     
## 2 x2       x      2     
## 3 y1       y      1     
## 4 y2       y      2
tibble(
  variable = c(
    "David Jude Heyworth Law", "Elton Hercules John",
    "Angelina Jolie Voight", "Jennifer Shrader Lawrence"
  )
) %>%
  extract(
    variable,
    into = c("short name", "remainder"),
    regex = "(\\w+) .* (\\w+)",
    remove = FALSE
  )
## # A tibble: 4 × 3
##   variable                  `short name` remainder
##   <chr>                     <chr>        <chr>    
## 1 David Jude Heyworth Law   David        Law      
## 2 Elton Hercules John       Elton        John     
## 3 Angelina Jolie Voight     Angelina     Voight   
## 4 Jennifer Shrader Lawrence Jennifer     Lawrence
# extracting from non-grouping parentheses
tibble(
  variable = c(
    "x -> 1",
    "y -> 2",
    "p-> 34"
  )
) %>%
  extract(
    variable,
    into = c("letter", "number"),
    remove = FALSE,
    regex = "([a-z])(?: ?-> ?)(\\d+)?"
  )
## # A tibble: 3 × 3
##   variable letter number
##   <chr>    <chr>  <chr> 
## 1 x -> 1   x      1     
## 2 y -> 2   y      2     
## 3 p-> 34   p      34
# another example
df <- tibble(
  variable = c(
    "x ->-> 1",
    "y -> 2",
    "p-> 34",
    "f 4"
  )
)

df %>%
  extract(
    variable,
    into = c("letter", "number"),
    remove = FALSE,
    regex = "([a-z]) ?(?:->){0,} ?(\\d+)?"
  )
## # A tibble: 4 × 3
##   variable letter number
##   <chr>    <chr>  <chr> 
## 1 x ->-> 1 x      1     
## 2 y -> 2   y      2     
## 3 p-> 34   p      34    
## 4 f 4      f      4
# another one
df <- tibble(
  variable = c(
    "x ->aslkdfj 1", "y-> 2",
    "p 34",
    "8"
  )
)

df %>%
  extract(
    variable,
    into = c("letter", "number"),
    remove = FALSE,
    regex = "([a-z])? ?(?:->\\w*)? ?(\\d+)"
  )
## # A tibble: 4 × 3
##   variable      letter number
##   <chr>         <chr>  <chr> 
## 1 x ->aslkdfj 1 "x"    1     
## 2 y-> 2         "y"    2     
## 3 p 34          "p"    34    
## 4 8             ""     8
# last example
tibble(
  value = c(
    "3.10 = AX",
    "3.1345 = AX:?_40",
    "3.8983 =:$15",
    ".873 = PFS:4"
  )
) %>%
  extract(
    value,
    into = c("v0", "v2", "v3", "v4"),
    regex = "(\\d)?\\.(\\d+) ?= ?(?:(\\w+)?:?)?(?:[?_$]*)(\\d+)?",
    remove = FALSE
  )
## # A tibble: 4 × 5
##   value            v0    v2    v3    v4   
##   <chr>            <chr> <chr> <chr> <chr>
## 1 3.10 = AX        "3"   10    "AX"  ""   
## 2 3.1345 = AX:?_40 "3"   1345  "AX"  "40" 
## 3 3.8983 =:$15     "3"   8983  ""    "15" 
## 4 .873 = PFS:4     ""    873   "PFS" "4"
# another example
results_regexp <- results %>%
  mutate(
    LastName = str_replace_all(Candidate, ".*\\s(.*?)\\,?$", "\\1")
  )
tail(results_regexp)
##                 Candidate    Pct_Vote    LastName
##         John A. Stefanini 0.291895856   Stefanini
##             Dhruba P. Sen 0.009259259         Sen
##            Mark S. Tilden 0.040245691      Tilden
##          Yvonne M. Spicer 0.547029703      Spicer
##  Benjaman A. Neves-Grigg, 0.012284562 Neves-Grigg
##            Priscila Sousa 0.049321599       Sousa

Techniques for anonymizing and pseudoanonymizing columns avoid data breaches that they are potentially dangerous for those affected. For that We will find out how to use the function fct_anon, how to replace names with random names, how to mask values, how to group numeric variables, how to remove house numbers from street names, and how to encode and decode values.The difference between pseudonymization and anonymization is that pseudonymization is reversible, while anonymization is not.

The EU defines pseudonymization as follows:

“The processing of personal data in such a manner that the personal data can no longer be attributed to a specific data subject without the use of additional information provided that such additional information is kept separately and is subject to technical and organisational measures to ensure that the personal data are not attributed to an identified or identifiable natural person.” (https://edps.europa.eu/system/files/2021-04/21-04-27_aepdedps_ anonymisation_en_5.pdf)

By this definition, pseudonymization is reversible and requires additional information to reverse the process.

Sometimes you want to make your data completely anonymous so that other people can’t see sensitive information. A simple function to anonymize such discrete data is fct_anon. The function takes two arguments. The factor you want to anonymize, and the prefix you put in front of the anonymized factor. The numbers are generated randomly. So, each time you run this code, you will get a different set of numbers.

Names are also sensitive data. To anonymize names, you can simply replace them with random names. This can be done with the randomNames function from the randomNames package. You get a different set of names each time you run the function. If we want to be more specific about how the names are generated, we can provide some additional information to the function.

Another common use case is the masking of values. Masking is a technique that hides some characters of a string. Mostly by “X”s. The .x stands for the piped variable (in this case height). Then I provide a regular expression that searches for the last character of the string (.$). This character should then be replaced by an X. The regular expression ^.{10} indicates that we are looking for the first 10 characters of the string. We replace this pattern with 10 “X”s, specified by strrep("X", 10). The function strrep is a basic function of R, which simply repeats a series of characters.

Another common technique for anonymizing data is to divide it into groups. With the function cut_width we can create groups of arbitrary width from a numeric variable. The round bracket means that a number is not included in the set. The square bracket means that a number is included in the set. The function cut_number creates a certain number of sets. Note, however, that the width of each group varies.

Finally, we can anonymize each column by encrypting it. When we encrypt a column, we convert the values of a column into another form, which we call ciphertext. The ciphertext is not readable by humans, but it can be converted back to the original value. There are two forms of encryption. Symmetric encryption, where a single key is used to encrypt and decrypt a value, and asymmetric encryption, where two keys are used to encrypt and decrypt a value. A key is plaintext that translates between the two representations. Once you have the key in symmetric encryption, you can decrypt values. To decrypt values in asymmetric encryption, you need the public key and the private key. The public key is as it says public, so open to anyone. The private key is a key you should not share with anyone. Only when you have both, can you decrypt a value. Also, private key cannot be guessed from the public key. To add another level of security, the private key also sometimes has a passphrase (or password) to it.

We can encrypt this data with the package encryptr. First we need to load the package and create the private and public keys using the genkeys function. The function prompted us to provide a passphrase for the private key. This passphrase and the private key should not be shared with anyone! Once we have the passphrase, we can encrypt our columns. To decrypt the column, we simply use the decrypt function. You must provide the passphrase to decrypt the column. Also, this works only if the R file is in the same directory as the public and private keys.

levels(gss_cat$relig)
##  [1] "No answer"               "Don't know"             
##  [3] "Inter-nondenominational" "Native american"        
##  [5] "Christian"               "Orthodox-christian"     
##  [7] "Moslem/islam"            "Other eastern"          
##  [9] "Hinduism"                "Buddhism"               
## [11] "Other"                   "None"                   
## [13] "Jewish"                  "Catholic"               
## [15] "Protestant"              "Not applicable"
# converting these levels into numeric values and add a prefix to them
gss_cat %>%
  mutate(
    relig = fct_anon(relig, prefix = "religion_")
  ) %>%
  glimpse()
## Rows: 21,483
## Columns: 9
## $ year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ race    <fct> White, White, White, White, White, White, White, White, White,…
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ relig   <fct> religion_14, religion_14, religion_14, religion_01, religion_0…
## $ denom   <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…
# anonymizing names
presidential %>%
  mutate(
    name = randomNames(nrow(.),
      sample.with.replacement = FALSE
    )
  )
## # A tibble: 11 × 4
##    name                start      end        party     
##    <chr>               <date>     <date>     <chr>     
##  1 Garcia, Deven       1953-01-20 1961-01-20 Republican
##  2 Young, Dejsha       1961-01-20 1963-11-22 Democratic
##  3 Robinson, Dante     1963-11-22 1969-01-20 Democratic
##  4 Cassetta, James     1969-01-20 1974-08-09 Republican
##  5 Kloster, Leeann     1974-08-09 1977-01-20 Republican
##  6 Vossler, Michaelann 1977-01-20 1981-01-20 Democratic
##  7 Rodriguez, Sonali   1981-01-20 1989-01-20 Republican
##  8 Jones, Jordan       1989-01-20 1993-01-20 Republican
##  9 el-Fares, Raihaan   1993-01-20 2001-01-20 Democratic
## 10 Craion, Isaac       2001-01-20 2009-01-20 Republican
## 11 Wilson, Lucero      2009-01-20 2017-01-20 Democratic
# customizing random names
presidential %>%
  mutate(
    name = randomNames(nrow(.),
      sample.with.replacement = FALSE,
      ethnicity = c(1, 2, 4),
      name.order = "first.last",
      name.sep = " "
    )
  )
## # A tibble: 11 × 4
##    name                start      end        party     
##    <chr>               <date>     <date>     <chr>     
##  1 John Ward           1953-01-20 1961-01-20 Republican
##  2 Joshua Xiong        1961-01-20 1963-11-22 Democratic
##  3 Alejandro Soliz     1963-11-22 1969-01-20 Democratic
##  4 Cambria Cdebaca     1969-01-20 1974-08-09 Republican
##  5 Hea Hildreth        1974-08-09 1977-01-20 Republican
##  6 Nancy Garcia        1977-01-20 1981-01-20 Democratic
##  7 Noe Chacon          1981-01-20 1989-01-20 Republican
##  8 Mackenzie Matagaono 1989-01-20 1993-01-20 Republican
##  9 Blanca Godoy        1993-01-20 2001-01-20 Democratic
## 10 Kaile Hammond       2001-01-20 2009-01-20 Republican
## 11 Kevin Blanco        2009-01-20 2017-01-20 Democratic
# masking values
starwars %>%
  mutate(
    height = map_chr(height, ~ str_replace(.x, ".$", "X"))
  )
## # A tibble: 87 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>       <chr>  <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa… 17X       77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO       16X       75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2       9X        32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader 20X      136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa 15X       49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars   17X      120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White… 16X       75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4       9X        32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark… 18X       84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke… 18X       77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹​hair_color, ²​skin_color, ³​eye_color, ⁴​birth_year, ⁵​homeworld
# masking more than one character
ccards <- tibble(
  creditcards = c(
    36555224524299,
    36350489667466,
    36002887965170,
    5447552069207504,
    2221002654361034,
    5127699386148536
  )
)

# converting the first 10 characters
ccards %>%
  mutate(
    creditcards_masked = map_chr(creditcards, ~ str_replace(.x, "^.{10}",
      replacement = strrep("X", 10)
    ))
  )
## # A tibble: 6 × 2
##   creditcards creditcards_masked
##         <dbl> <chr>             
## 1     3.66e13 XXXXXXXXXX4299    
## 2     3.64e13 XXXXXXXXXX7466    
## 3     3.60e13 XXXXXXXXXX5170    
## 4     5.45e15 XXXXXXXXXX207504  
## 5     2.22e15 XXXXXXXXXX361034  
## 6     5.13e15 XXXXXXXXXX148536
# converting the last 5 characters
ccards %>%
  mutate(
    creditcars = map_chr(creditcards, ~ str_replace(.x, "\\d{5}$",
      replacement = strrep("X", 5)
    ))
  )
## # A tibble: 6 × 2
##   creditcards creditcars      
##         <dbl> <chr>           
## 1     3.66e13 365552245XXXXX  
## 2     3.64e13 363504896XXXXX  
## 3     3.60e13 360028879XXXXX  
## 4     5.45e15 54475520692XXXXX
## 5     2.22e15 22210026543XXXXX
## 6     5.13e15 51276993861XXXXX
# dividing values into groups
# get the data
(age_starwars <- starwars %>%
  mutate(age = as.integer(format(Sys.Date(), "%Y")) - birth_year) %>%
  select(name, age) %>%
  drop_na(age))
## # A tibble: 43 × 2
##    name                 age
##    <chr>              <dbl>
##  1 Luke Skywalker     2003 
##  2 C-3PO              1910 
##  3 R2-D2              1989 
##  4 Darth Vader        1980.
##  5 Leia Organa        2003 
##  6 Owen Lars          1970 
##  7 Beru Whitesun lars 1975 
##  8 Biggs Darklighter  1998 
##  9 Obi-Wan Kenobi     1965 
## 10 Anakin Skywalker   1980.
## # … with 33 more rows
# using cut_width()
age_starwars %>%
  mutate(
    age_groups = cut_width(age, 10)
  )
## # A tibble: 43 × 3
##    name                 age age_groups 
##    <chr>              <dbl> <fct>      
##  1 Luke Skywalker     2003  (1995,2005]
##  2 C-3PO              1910  (1905,1915]
##  3 R2-D2              1989  (1985,1995]
##  4 Darth Vader        1980. (1975,1985]
##  5 Leia Organa        2003  (1995,2005]
##  6 Owen Lars          1970  (1965,1975]
##  7 Beru Whitesun lars 1975  (1965,1975]
##  8 Biggs Darklighter  1998  (1995,2005]
##  9 Obi-Wan Kenobi     1965  (1955,1965]
## 10 Anakin Skywalker   1980. (1975,1985]
## # … with 33 more rows
# using cut_number()
age_starwars %>%
  mutate(
    age_groups = cut_number(age, 10)
  )
## # A tibble: 43 × 3
##    name                 age age_groups 
##    <chr>              <dbl> <fct>      
##  1 Luke Skywalker     2003  (2001,2014]
##  2 C-3PO              1910  [1126,1922]
##  3 R2-D2              1989  (1981,1991]
##  4 Darth Vader        1980. (1976,1981]
##  5 Leia Organa        2003  (2001,2014]
##  6 Owen Lars          1970  (1965,1970]
##  7 Beru Whitesun lars 1975  (1970,1976]
##  8 Biggs Darklighter  1998  (1991,2001]
##  9 Obi-Wan Kenobi     1965  (1965,1970]
## 10 Anakin Skywalker   1980. (1976,1981]
## # … with 33 more rows
# converting to millennia
age_starwars %>%
  mutate(
    millenium = 1000 * (age %/% 1000)
  )
## # A tibble: 43 × 3
##    name                 age millenium
##    <chr>              <dbl>     <dbl>
##  1 Luke Skywalker     2003       2000
##  2 C-3PO              1910       1000
##  3 R2-D2              1989       1000
##  4 Darth Vader        1980.      1000
##  5 Leia Organa        2003       2000
##  6 Owen Lars          1970       1000
##  7 Beru Whitesun lars 1975       1000
##  8 Biggs Darklighter  1998       1000
##  9 Obi-Wan Kenobi     1965       1000
## 10 Anakin Skywalker   1980.      1000
## # … with 33 more rows
# converting to decades
age_starwars %>%
  mutate(
    decade = 10 * (age %/% 10)
  )
## # A tibble: 43 × 3
##    name                 age decade
##    <chr>              <dbl>  <dbl>
##  1 Luke Skywalker     2003    2000
##  2 C-3PO              1910    1910
##  3 R2-D2              1989    1980
##  4 Darth Vader        1980.   1980
##  5 Leia Organa        2003    2000
##  6 Owen Lars          1970    1970
##  7 Beru Whitesun lars 1975    1970
##  8 Biggs Darklighter  1998    1990
##  9 Obi-Wan Kenobi     1965    1960
## 10 Anakin Skywalker   1980.   1980
## # … with 33 more rows
# removing house numbers from street names
street_names <- tibble(
  street_name = c(
    "Bromley Lanes 34",
    "Woodsgate Avenue 12",
    "Ardconnel Terrace 99",
    "Gipsy Birches 45",
    "Legate Close 8",
    "Stevenson Oval 9",
    "St Leonard's Boulevard 112",
    "Copper Chare 435",
    "Glastonbury Glebe 82",
    "Southern Way 91"
  )
)

street_names %>%
  mutate(
    street_names_no_number = str_remove_all(street_name, "\\d")
  )
## # A tibble: 10 × 2
##    street_name                street_names_no_number   
##    <chr>                      <chr>                    
##  1 Bromley Lanes 34           "Bromley Lanes "         
##  2 Woodsgate Avenue 12        "Woodsgate Avenue "      
##  3 Ardconnel Terrace 99       "Ardconnel Terrace "     
##  4 Gipsy Birches 45           "Gipsy Birches "         
##  5 Legate Close 8             "Legate Close "          
##  6 Stevenson Oval 9           "Stevenson Oval "        
##  7 St Leonard's Boulevard 112 "St Leonard's Boulevard "
##  8 Copper Chare 435           "Copper Chare "          
##  9 Glastonbury Glebe 82       "Glastonbury Glebe "     
## 10 Southern Way 91            "Southern Way "
# encrypting and decrypting columns
users <- tibble(
  name = c("Alexander", "Marie", "John"),
  password = c(12345, "8$43_45*", "becker23#")
)

# genkeys() generates a public and private key pair
# Passphrase: 456#7
# genkeys()

# encrypting a column
# users_encrypted <- users %>%
#   encrypt(password)

# users_encrypted %>%
#   glimpse()

# decrypting the column
# users_encrypted %>% decrypt(password)

4.3 Packages

Sometimes it makes sense to spread an observation over multiple rows (long format), and sometimes it makes more sense to spread a variable across multiple columns (wide format). Some analyses require long data, whereas others require wide data.

When going from a long format to a wide format, you choose columns to group the observations by (in the gapminder case: country and maybe also continent), columns to take values names from (lifeExp, pop and gdpPercap), and columns to create variable names from (year). In data.table, the transformation from long to wide is done using the dcast function.

In data.table, wide-to-long formatting is done using melt.

gm <- as.data.table(gapminder)
head(gm)
##        country continent year lifeExp      pop gdpPercap
## 1: Afghanistan      Asia 1952  28.801  8425333  779.4453
## 2: Afghanistan      Asia 1957  30.332  9240934  820.8530
## 3: Afghanistan      Asia 1962  31.997 10267083  853.1007
## 4: Afghanistan      Asia 1967  34.020 11537966  836.1971
## 5: Afghanistan      Asia 1972  36.088 13079460  739.9811
## 6: Afghanistan      Asia 1977  38.438 14880372  786.1134
# from long to wide
gmw <- dcast(gm, country + continent ~ year, value.var = c("pop", "lifeExp", "gdpPercap"))
head(gmw)
##        country continent pop_1952 pop_1957 pop_1962 pop_1967 pop_1972 pop_1977
## 1: Afghanistan      Asia  8425333  9240934 10267083 11537966 13079460 14880372
## 2:     Albania    Europe  1282697  1476505  1728137  1984060  2263554  2509048
## 3:     Algeria    Africa  9279525 10270856 11000948 12760499 14760787 17152804
## 4:      Angola    Africa  4232095  4561361  4826015  5247469  5894858  6162675
## 5:   Argentina  Americas 17876956 19610538 21283783 22934225 24779799 26983828
## 6:   Australia   Oceania  8691212  9712569 10794968 11872264 13177000 14074100
##    pop_1982 pop_1987 pop_1992 pop_1997 pop_2002 pop_2007 lifeExp_1952
## 1: 12881816 13867957 16317921 22227415 25268405 31889923       28.801
## 2:  2780097  3075321  3326498  3428038  3508512  3600523       55.230
## 3: 20033753 23254956 26298373 29072015 31287142 33333216       43.077
## 4:  7016384  7874230  8735988  9875024 10866106 12420476       30.015
## 5: 29341374 31620918 33958947 36203463 38331121 40301927       62.485
## 6: 15184200 16257249 17481977 18565243 19546792 20434176       69.120
##    lifeExp_1957 lifeExp_1962 lifeExp_1967 lifeExp_1972 lifeExp_1977
## 1:       30.332       31.997       34.020       36.088       38.438
## 2:       59.280       64.820       66.220       67.690       68.930
## 3:       45.685       48.303       51.407       54.518       58.014
## 4:       31.999       34.000       35.985       37.928       39.483
## 5:       64.399       65.142       65.634       67.065       68.481
## 6:       70.330       70.930       71.100       71.930       73.490
##    lifeExp_1982 lifeExp_1987 lifeExp_1992 lifeExp_1997 lifeExp_2002
## 1:       39.854       40.822       41.674       41.763       42.129
## 2:       70.420       72.000       71.581       72.950       75.651
## 3:       61.368       65.799       67.744       69.152       70.994
## 4:       39.942       39.906       40.647       40.963       41.003
## 5:       69.942       70.774       71.868       73.275       74.340
## 6:       74.740       76.320       77.560       78.830       80.370
##    lifeExp_2007 gdpPercap_1952 gdpPercap_1957 gdpPercap_1962 gdpPercap_1967
## 1:       43.828       779.4453        820.853       853.1007       836.1971
## 2:       76.423      1601.0561       1942.284      2312.8890      2760.1969
## 3:       72.301      2449.0082       3013.976      2550.8169      3246.9918
## 4:       42.731      3520.6103       3827.940      4269.2767      5522.7764
## 5:       75.320      5911.3151       6856.856      7133.1660      8052.9530
## 6:       81.235     10039.5956      10949.650     12217.2269     14526.1246
##    gdpPercap_1972 gdpPercap_1977 gdpPercap_1982 gdpPercap_1987 gdpPercap_1992
## 1:       739.9811       786.1134       978.0114       852.3959       649.3414
## 2:      3313.4222      3533.0039      3630.8807      3738.9327      2497.4379
## 3:      4182.6638      4910.4168      5745.1602      5681.3585      5023.2166
## 4:      5473.2880      3008.6474      2756.9537      2430.2083      2627.8457
## 5:      9443.0385     10079.0267      8997.8974      9139.6714      9308.4187
## 6:     16788.6295     18334.1975     19477.0093     21888.8890     23424.7668
##    gdpPercap_1997 gdpPercap_2002 gdpPercap_2007
## 1:       635.3414       726.7341       974.5803
## 2:      3193.0546      4604.2117      5937.0295
## 3:      4797.2951      5288.0404      6223.3675
## 4:      2277.1409      2773.2873      4797.2313
## 5:     10967.2820      8797.6407     12779.3796
## 6:     26997.9366     30687.7547     34435.3674
# wide-to-long (not the best approach!)
gm <- melt(gmw, id.vars = c("country", "continent"), measure.vars = 2:37)
head(gm)
##        country continent  variable    value
## 1: Afghanistan      Asia continent     Asia
## 2:     Albania    Europe continent   Europe
## 3:     Algeria    Africa continent   Africa
## 4:      Angola    Africa continent   Africa
## 5:   Argentina  Americas continent Americas
## 6:   Australia   Oceania continent  Oceania
# splitting columns and casting
gm[, c("variable", "year") := tstrsplit(variable, "_", fixed = TRUE)]
gm <- dcast(gm, country + year ~ variable, value.var = c("value"))
head(gm)
##        country year continent   gdpPercap lifeExp      pop
## 1: Afghanistan <NA>      Asia        <NA>    <NA>     <NA>
## 2: Afghanistan 1952      <NA> 779.4453145  28.801  8425333
## 3: Afghanistan 1957      <NA> 820.8530296  30.332  9240934
## 4: Afghanistan 1962      <NA>   853.10071  31.997 10267083
## 5: Afghanistan 1967      <NA> 836.1971382   34.02 11537966
## 6: Afghanistan 1972      <NA> 739.9811058  36.088 13079460
# merging columns
aq <- as.data.table(airquality)
head(aq)
##    Ozone Solar.R Wind Temp Month Day
## 1:    41     190  7.4   67     5   1
## 2:    36     118  8.0   72     5   2
## 3:    12     149 12.6   74     5   3
## 4:    18     313 11.5   62     5   4
## 5:    NA      NA 14.3   56     5   5
## 6:    28      NA 14.9   66     5   6
# creating a new column Date and mergin Month and Day into it (date format)
aq[, Date := as.Date(paste(1973, aq$Month, aq$Day, sep = "-"))]
head(aq)
##    Ozone Solar.R Wind Temp Month Day       Date
## 1:    41     190  7.4   67     5   1 1973-05-01
## 2:    36     118  8.0   72     5   2 1973-05-02
## 3:    12     149 12.6   74     5   3 1973-05-03
## 4:    18     313 11.5   62     5   4 1973-05-04
## 5:    NA      NA 14.3   56     5   5 1973-05-05
## 6:    28      NA 14.9   66     5   6 1973-05-06

It is common that data is spread over multiple tables. Consequently, it is important to be able to merge data from different tables. The simplest types of merges are binds, which can be used when you have two tables where either the rows or the columns match each other exactly.

An operation that combines columns from two tables is called a join. There are two main types of joins: inner joins and outer joins.

  • Inner joins: create a table containing all observations for which the key appeared in both tables. So if we perform an inner join on the rev_data and weather_data tables using DATE as the key, it won”t contain data for the days that are missing from either the revenue table or the weather table.

In contrast, outer joins create a table retaining rows, even if there is no match in the other table. There are three types of outer joins:

  • Left join: retains all rows from the first table. In the revenue example, this means all dates present in rev_data.
  • Right join: retains all rows from the second table. In the revenue example, this means all dates present in weather_data.
  • Full join: retains all rows present in at least one of the tables. In the revenue example, this means all dates present in at least one of rev_data and weather_data.

Semijoins and antijoins are similar to joins, but work on observations rather than variables. That is, they are used for filtering one table using data from another table:

  • Semijoin: retains all observations in the first table that have a match in the second table.
  • Antijoin: retains all observations in the first table that do not have a match in the second table.

The same thing can be achieved using the filtering techniques, but semijoins and antijoins are simpler to use when the filtering relies on conditions from another table.

# preparing the data
rev_data <- read.csv("input/sales-rev.csv", sep = ";")
weather_data <- read.csv("input/sales-weather.csv", sep = ";")

rev_data <- as.data.table(rev_data)
rev_data$DATE <- as.Date(rev_data$DATE)
weather_data <- as.data.table(weather_data)
weather_data$DATE <- as.Date(weather_data$DATE)

rev_jan <- rev_data[DATE %between% c("2020-01-01", "2020-01-31"), ]
rev_feb <- rev_data[DATE %between% c("2020-02-01", "2020-02-29"), ]
weather_jan <- weather_data[DATE %between% c("2020-01-01", "2020-01-31"), ]

str(rev_jan) 
## Classes 'data.table' and 'data.frame':   31 obs. of  2 variables:
##  $ DATE   : Date, format: "2020-01-01" "2020-01-02" ...
##  $ REVENUE: int  7637 9276 11170 11863 10880 6702 8652 8346 6543 8115 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(rev_feb) 
## Classes 'data.table' and 'data.frame':   29 obs. of  2 variables:
##  $ DATE   : Date, format: "2020-02-01" "2020-02-02" ...
##  $ REVENUE: int  10192 13904 11208 8578 6638 7093 8187 10099 8160 3797 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(weather_jan)
## Classes 'data.table' and 'data.frame':   31 obs. of  5 variables:
##  $ DATE         : Date, format: "2020-01-01" "2020-01-02" ...
##  $ SUN_HOURS    : num  0 0.372 0.264 3.549 2.513 ...
##  $ PRECIPITATION: num  0 0 0 0 2.4 0.2 0 0 0.6 0 ...
##  $ SNOW_DEPTH   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TEMPERATURE  : num  0 5.8 5.4 2.4 0.1 4.8 5.5 6.5 3.9 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# with Base-R
# Join columns of datasets that # have the same rows: 
cbind(rev_jan, weather_jan)
##           DATE REVENUE       DATE  SUN_HOURS PRECIPITATION SNOW_DEPTH
##  1: 2020-01-01    7637 2020-01-01 0.00000000           0.0          0
##  2: 2020-01-02    9276 2020-01-02 0.37250000           0.0          0
##  3: 2020-01-03   11170 2020-01-03 0.26361111           0.0          0
##  4: 2020-01-04   11863 2020-01-04 3.54861111           0.0          0
##  5: 2020-01-05   10880 2020-01-05 2.51250000           2.4          0
##  6: 2020-01-06    6702 2020-01-06 0.09333333           0.2          0
##  7: 2020-01-07    8652 2020-01-07 0.00000000           0.0          0
##  8: 2020-01-08    8346 2020-01-08 0.16166667           0.0          0
##  9: 2020-01-09    6543 2020-01-09 0.31944444           0.6          0
## 10: 2020-01-10    8115 2020-01-10 1.58694444           0.0          0
## 11: 2020-01-11    7728 2020-01-11 0.00000000           0.2          0
## 12: 2020-01-12   10649 2020-01-12 0.44194444           0.0          0
## 13: 2020-01-13    6787 2020-01-13 2.42083333           0.0          0
## 14: 2020-01-14    4555 2020-01-14 0.00000000           7.4          0
## 15: 2020-01-15    5885 2020-01-15 0.00000000           0.0          0
## 16: 2020-01-16   10127 2020-01-16 0.85583333           0.0          0
## 17: 2020-01-17    8893 2020-01-17 0.00000000           0.0          0
## 18: 2020-01-18   12520 2020-01-18 0.05500000           0.0          0
## 19: 2020-01-19   11860 2020-01-19 6.18555556           0.0          0
## 20: 2020-01-20    8515 2020-01-20 0.77305556           0.0          0
## 21: 2020-01-21    8129 2020-01-21 2.09777778           0.0          0
## 22: 2020-01-22   10405 2020-01-22 3.55250000           0.0          0
## 23: 2020-01-23    6672 2020-01-23 2.02694444           0.0          0
## 24: 2020-01-24   12300 2020-01-24 4.97361111           0.0          0
## 25: 2020-01-25   10651 2020-01-25 4.45861111           0.0          0
## 26: 2020-01-26   11882 2020-01-26 0.00000000           0.0          0
## 27: 2020-01-27    9397 2020-01-27 0.00000000           0.6          0
## 28: 2020-01-28    5174 2020-01-28 0.00000000          10.9          0
## 29: 2020-01-29    4436 2020-01-29 0.00000000           1.5          0
## 30: 2020-01-30    6202 2020-01-30 0.00000000           0.2          0
## 31: 2020-01-31    9949 2020-01-31 0.00000000           2.0          0
##           DATE REVENUE       DATE  SUN_HOURS PRECIPITATION SNOW_DEPTH
##     TEMPERATURE
##  1:         0.0
##  2:         5.8
##  3:         5.4
##  4:         2.4
##  5:         0.1
##  6:         4.8
##  7:         5.5
##  8:         6.5
##  9:         3.9
## 10:         1.0
## 11:         1.2
## 12:         5.9
## 13:         2.6
## 14:         3.8
## 15:         7.8
## 16:         6.1
## 17:         4.8
## 18:         4.8
## 19:         3.1
## 20:         3.8
## 21:         6.8
## 22:         3.2
## 23:         1.5
## 24:         5.1
## 25:         2.1
## 26:         5.8
## 27:         4.8
## 28:         3.4
## 29:         3.3
## 30:         2.4
## 31:         3.0
##     TEMPERATURE
# Join rows of datasets that have # the same columns: 
rbind(rev_jan, rev_feb)
##           DATE REVENUE
##  1: 2020-01-01    7637
##  2: 2020-01-02    9276
##  3: 2020-01-03   11170
##  4: 2020-01-04   11863
##  5: 2020-01-05   10880
##  6: 2020-01-06    6702
##  7: 2020-01-07    8652
##  8: 2020-01-08    8346
##  9: 2020-01-09    6543
## 10: 2020-01-10    8115
## 11: 2020-01-11    7728
## 12: 2020-01-12   10649
## 13: 2020-01-13    6787
## 14: 2020-01-14    4555
## 15: 2020-01-15    5885
## 16: 2020-01-16   10127
## 17: 2020-01-17    8893
## 18: 2020-01-18   12520
## 19: 2020-01-19   11860
## 20: 2020-01-20    8515
## 21: 2020-01-21    8129
## 22: 2020-01-22   10405
## 23: 2020-01-23    6672
## 24: 2020-01-24   12300
## 25: 2020-01-25   10651
## 26: 2020-01-26   11882
## 27: 2020-01-27    9397
## 28: 2020-01-28    5174
## 29: 2020-01-29    4436
## 30: 2020-01-30    6202
## 31: 2020-01-31    9949
## 32: 2020-02-01   10192
## 33: 2020-02-02   13904
## 34: 2020-02-03   11208
## 35: 2020-02-04    8578
## 36: 2020-02-05    6638
## 37: 2020-02-06    7093
## 38: 2020-02-07    8187
## 39: 2020-02-08   10099
## 40: 2020-02-09    8160
## 41: 2020-02-10    3797
## 42: 2020-02-11    8734
## 43: 2020-02-12    4355
## 44: 2020-02-13    8452
## 45: 2020-02-14    7367
## 46: 2020-02-15    9339
## 47: 2020-02-16    5427
## 48: 2020-02-17    7022
## 49: 2020-02-18    8417
## 50: 2020-02-19    8760
## 51: 2020-02-20    4936
## 52: 2020-02-21   11273
## 53: 2020-02-22    7046
## 54: 2020-02-23   10074
## 55: 2020-02-24    9114
## 56: 2020-02-25    4493
## 57: 2020-02-26    4992
## 58: 2020-02-27    3789
## 59: 2020-02-28    7480
## 60: 2020-02-29    8358
##           DATE REVENUE
# with dplyr
# Join columns of datasets that # have the same rows: 
# bind_cols(rev_jan, weather_jan) 

# Join rows of datasets that have # the same columns: 
# bind_rows(rev_jan, rev_feb)

# merging tables using keys
# data.table
# inner join
merge(rev_data, weather_data, by = "DATE")
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
##  1: 2020-01-01    7637  0.00000000           0.0       0.00         0.0
##  2: 2020-01-02    9276  0.37250000           0.0       0.00         5.8
##  3: 2020-01-03   11170  0.26361111           0.0       0.00         5.4
##  4: 2020-01-04   11863  3.54861111           0.0       0.00         2.4
##  5: 2020-01-05   10880  2.51250000           2.4       0.00         0.1
##  6: 2020-01-06    6702  0.09333333           0.2       0.00         4.8
##  7: 2020-01-07    8652  0.00000000           0.0       0.00         5.5
##  8: 2020-01-08    8346  0.16166667           0.0       0.00         6.5
##  9: 2020-01-09    6543  0.31944444           0.6       0.00         3.9
## 10: 2020-01-10    8115  1.58694444           0.0       0.00         1.0
## 11: 2020-01-11    7728  0.00000000           0.2       0.00         1.2
## 12: 2020-01-12   10649  0.44194444           0.0       0.00         5.9
## 13: 2020-01-13    6787  2.42083333           0.0       0.00         2.6
## 14: 2020-01-14    4555  0.00000000           7.4       0.00         3.8
## 15: 2020-01-15    5885  0.00000000           0.0       0.00         7.8
## 16: 2020-01-16   10127  0.85583333           0.0       0.00         6.1
## 17: 2020-01-17    8893  0.00000000           0.0       0.00         4.8
## 18: 2020-01-18   12520  0.05500000           0.0       0.00         4.8
## 19: 2020-01-19   11860  6.18555556           0.0       0.00         3.1
## 20: 2020-01-20    8515  0.77305556           0.0       0.00         3.8
## 21: 2020-01-21    8129  2.09777778           0.0       0.00         6.8
## 22: 2020-01-22   10405  3.55250000           0.0       0.00         3.2
## 23: 2020-01-23    6672  2.02694444           0.0       0.00         1.5
## 24: 2020-01-24   12300  4.97361111           0.0       0.00         5.1
## 25: 2020-01-25   10651  4.45861111           0.0       0.00         2.1
## 26: 2020-01-26   11882  0.00000000           0.0       0.00         5.8
## 27: 2020-01-27    9397  0.00000000           0.6       0.00         4.8
## 28: 2020-01-28    5174  0.00000000          10.9       0.00         3.4
## 29: 2020-01-29    4436  0.00000000           1.5       0.00         3.3
## 30: 2020-01-30    6202  0.00000000           0.2       0.00         2.4
## 31: 2020-01-31    9949  0.00000000           2.0       0.00         3.0
## 32: 2020-02-01   10192  1.50750000           1.5       0.00         6.1
## 33: 2020-02-02   13904  6.70888889           0.6       0.00         3.8
## 34: 2020-02-03   11208  6.85888889           0.0       0.00         0.5
## 35: 2020-02-04    8578  0.09138889           0.2       0.00        -1.3
## 36: 2020-02-07    8187  1.15611111           0.0       0.00         0.1
## 37: 2020-02-08   10099  0.00000000           0.8       0.00         2.4
## 38: 2020-02-09    8160  1.14805556           6.4       0.00         4.5
## 39: 2020-02-10    3797  0.23944444           0.7       0.00         5.7
## 40: 2020-02-11    8734  0.69166667           0.0       0.00         4.0
## 41: 2020-02-12    4355  1.69333333           1.5       0.00         3.2
## 42: 2020-02-13    8452  8.53333333           0.0       0.00         1.0
## 43: 2020-02-14    7367  6.66250000           0.0       0.00         0.8
## 44: 2020-02-15    9339  0.01416667           4.6       0.00         3.8
## 45: 2020-02-16    5427  0.00000000           1.5       0.00         7.1
## 46: 2020-02-17    7022  1.55083333           0.1       0.00         6.6
## 47: 2020-02-18    8417  2.58805556           0.0       0.00         6.6
## 48: 2020-02-19    8760  7.62638889           0.0       0.00         4.5
## 49: 2020-02-20    4936  3.16527778           5.0       0.00         2.8
## 50: 2020-02-21   11273  7.49055556           0.0       0.00         5.7
## 51: 2020-02-22    7046  0.20055556           0.8       0.00         6.5
## 52: 2020-02-23   10074  6.78666667           0.0       0.00         5.2
## 53: 2020-02-24    9114  8.94694444           0.0       0.00         2.3
## 54: 2020-02-25    4493  2.53583333           1.1       0.00         1.4
## 55: 2020-02-26    4992  0.51944444           0.5       0.00         0.2
## 56: 2020-02-27    3789  0.00000000           0.8       0.01        -1.3
## 57: 2020-02-28    7480  6.55861111           0.2       0.05        -1.3
## 58: 2020-02-29    8358  2.25750000           4.5       0.05         1.1
## 59: 2020-03-02    6489  6.01888889           4.0       0.00         4.4
## 60: 2020-03-03    3586  0.00000000           7.2       0.00         3.7
## 61: 2020-03-04    2570  0.00000000           6.4       0.00         2.1
## 62: 2020-03-05    5059  0.04666667           0.1       0.00         2.1
## 63: 2020-03-06    9087  9.34805556           0.0       0.00         2.0
## 64: 2020-03-07   11570  2.85055556           0.0       0.00         1.9
## 65: 2020-03-08   10476  2.89916667           0.0       0.00         4.5
## 66: 2020-03-09    8977  3.60500000           2.0       0.00         6.8
## 67: 2020-03-11    5837  0.13555556           0.0       0.00         5.6
## 68: 2020-03-12    3559  0.66138889          15.2       0.00         4.9
## 69: 2020-03-13    7489  5.24972222           0.1       0.00         2.1
## 70: 2020-03-14    8138 11.19111111           0.0       0.00        -0.9
## 71: 2020-03-15    8876  5.43888889           2.1       0.00         2.0
## 72: 2020-03-16    4800  6.09472222           2.9       0.00         4.1
## 73: 2020-03-17    3836  0.59083333           0.0       0.00         5.7
## 74: 2020-03-18    2750  1.32916667           5.8       0.00         6.3
## 75: 2020-03-19    7715 11.57500000           0.1       0.00         5.5
## 76: 2020-03-20    4364  3.21694444           0.1       0.00         2.1
## 77: 2020-03-21    9393 10.37333333           0.0       0.00         0.0
## 78: 2020-03-22    7830 11.13527778           0.0       0.00         0.1
## 79: 2020-03-23    4858  6.43416667           0.0       0.00         2.8
## 80: 2020-03-24    5140  5.35000000           0.0       0.00         4.2
## 81: 2020-03-25    7043  6.95083333           0.0       0.00         7.0
## 82: 2020-03-26    8662 11.44222222           0.0       0.00         6.5
## 83: 2020-03-27    9196  9.69944444           0.0       0.00         6.2
## 84: 2020-03-28    8267  3.41638889           0.0       0.00         5.3
## 85: 2020-03-30    4249  7.36500000           0.1       0.00         0.1
## 86: 2020-03-31    8291 10.23666667           0.0       0.00         3.4
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or:
# setkey(rev_data, DATE)
# rev_data[weather_data, nomatch = 0]

# # dplyr
# rev_data %>% 
#   inner_join( weather_data, by = "DATE")

# outer join (left)
# data.table
merge(rev_data, weather_data, all.x = TRUE, by = "DATE") 
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
##  1: 2020-01-01    7637  0.00000000           0.0       0.00         0.0
##  2: 2020-01-02    9276  0.37250000           0.0       0.00         5.8
##  3: 2020-01-03   11170  0.26361111           0.0       0.00         5.4
##  4: 2020-01-04   11863  3.54861111           0.0       0.00         2.4
##  5: 2020-01-05   10880  2.51250000           2.4       0.00         0.1
##  6: 2020-01-06    6702  0.09333333           0.2       0.00         4.8
##  7: 2020-01-07    8652  0.00000000           0.0       0.00         5.5
##  8: 2020-01-08    8346  0.16166667           0.0       0.00         6.5
##  9: 2020-01-09    6543  0.31944444           0.6       0.00         3.9
## 10: 2020-01-10    8115  1.58694444           0.0       0.00         1.0
## 11: 2020-01-11    7728  0.00000000           0.2       0.00         1.2
## 12: 2020-01-12   10649  0.44194444           0.0       0.00         5.9
## 13: 2020-01-13    6787  2.42083333           0.0       0.00         2.6
## 14: 2020-01-14    4555  0.00000000           7.4       0.00         3.8
## 15: 2020-01-15    5885  0.00000000           0.0       0.00         7.8
## 16: 2020-01-16   10127  0.85583333           0.0       0.00         6.1
## 17: 2020-01-17    8893  0.00000000           0.0       0.00         4.8
## 18: 2020-01-18   12520  0.05500000           0.0       0.00         4.8
## 19: 2020-01-19   11860  6.18555556           0.0       0.00         3.1
## 20: 2020-01-20    8515  0.77305556           0.0       0.00         3.8
## 21: 2020-01-21    8129  2.09777778           0.0       0.00         6.8
## 22: 2020-01-22   10405  3.55250000           0.0       0.00         3.2
## 23: 2020-01-23    6672  2.02694444           0.0       0.00         1.5
## 24: 2020-01-24   12300  4.97361111           0.0       0.00         5.1
## 25: 2020-01-25   10651  4.45861111           0.0       0.00         2.1
## 26: 2020-01-26   11882  0.00000000           0.0       0.00         5.8
## 27: 2020-01-27    9397  0.00000000           0.6       0.00         4.8
## 28: 2020-01-28    5174  0.00000000          10.9       0.00         3.4
## 29: 2020-01-29    4436  0.00000000           1.5       0.00         3.3
## 30: 2020-01-30    6202  0.00000000           0.2       0.00         2.4
## 31: 2020-01-31    9949  0.00000000           2.0       0.00         3.0
## 32: 2020-02-01   10192  1.50750000           1.5       0.00         6.1
## 33: 2020-02-02   13904  6.70888889           0.6       0.00         3.8
## 34: 2020-02-03   11208  6.85888889           0.0       0.00         0.5
## 35: 2020-02-04    8578  0.09138889           0.2       0.00        -1.3
## 36: 2020-02-05    6638          NA            NA         NA          NA
## 37: 2020-02-06    7093          NA            NA         NA          NA
## 38: 2020-02-07    8187  1.15611111           0.0       0.00         0.1
## 39: 2020-02-08   10099  0.00000000           0.8       0.00         2.4
## 40: 2020-02-09    8160  1.14805556           6.4       0.00         4.5
## 41: 2020-02-10    3797  0.23944444           0.7       0.00         5.7
## 42: 2020-02-11    8734  0.69166667           0.0       0.00         4.0
## 43: 2020-02-12    4355  1.69333333           1.5       0.00         3.2
## 44: 2020-02-13    8452  8.53333333           0.0       0.00         1.0
## 45: 2020-02-14    7367  6.66250000           0.0       0.00         0.8
## 46: 2020-02-15    9339  0.01416667           4.6       0.00         3.8
## 47: 2020-02-16    5427  0.00000000           1.5       0.00         7.1
## 48: 2020-02-17    7022  1.55083333           0.1       0.00         6.6
## 49: 2020-02-18    8417  2.58805556           0.0       0.00         6.6
## 50: 2020-02-19    8760  7.62638889           0.0       0.00         4.5
## 51: 2020-02-20    4936  3.16527778           5.0       0.00         2.8
## 52: 2020-02-21   11273  7.49055556           0.0       0.00         5.7
## 53: 2020-02-22    7046  0.20055556           0.8       0.00         6.5
## 54: 2020-02-23   10074  6.78666667           0.0       0.00         5.2
## 55: 2020-02-24    9114  8.94694444           0.0       0.00         2.3
## 56: 2020-02-25    4493  2.53583333           1.1       0.00         1.4
## 57: 2020-02-26    4992  0.51944444           0.5       0.00         0.2
## 58: 2020-02-27    3789  0.00000000           0.8       0.01        -1.3
## 59: 2020-02-28    7480  6.55861111           0.2       0.05        -1.3
## 60: 2020-02-29    8358  2.25750000           4.5       0.05         1.1
## 61: 2020-03-02    6489  6.01888889           4.0       0.00         4.4
## 62: 2020-03-03    3586  0.00000000           7.2       0.00         3.7
## 63: 2020-03-04    2570  0.00000000           6.4       0.00         2.1
## 64: 2020-03-05    5059  0.04666667           0.1       0.00         2.1
## 65: 2020-03-06    9087  9.34805556           0.0       0.00         2.0
## 66: 2020-03-07   11570  2.85055556           0.0       0.00         1.9
## 67: 2020-03-08   10476  2.89916667           0.0       0.00         4.5
## 68: 2020-03-09    8977  3.60500000           2.0       0.00         6.8
## 69: 2020-03-10    4509          NA            NA         NA          NA
## 70: 2020-03-11    5837  0.13555556           0.0       0.00         5.6
## 71: 2020-03-12    3559  0.66138889          15.2       0.00         4.9
## 72: 2020-03-13    7489  5.24972222           0.1       0.00         2.1
## 73: 2020-03-14    8138 11.19111111           0.0       0.00        -0.9
## 74: 2020-03-15    8876  5.43888889           2.1       0.00         2.0
## 75: 2020-03-16    4800  6.09472222           2.9       0.00         4.1
## 76: 2020-03-17    3836  0.59083333           0.0       0.00         5.7
## 77: 2020-03-18    2750  1.32916667           5.8       0.00         6.3
## 78: 2020-03-19    7715 11.57500000           0.1       0.00         5.5
## 79: 2020-03-20    4364  3.21694444           0.1       0.00         2.1
## 80: 2020-03-21    9393 10.37333333           0.0       0.00         0.0
## 81: 2020-03-22    7830 11.13527778           0.0       0.00         0.1
## 82: 2020-03-23    4858  6.43416667           0.0       0.00         2.8
## 83: 2020-03-24    5140  5.35000000           0.0       0.00         4.2
## 84: 2020-03-25    7043  6.95083333           0.0       0.00         7.0
## 85: 2020-03-26    8662 11.44222222           0.0       0.00         6.5
## 86: 2020-03-27    9196  9.69944444           0.0       0.00         6.2
## 87: 2020-03-28    8267  3.41638889           0.0       0.00         5.3
## 88: 2020-03-29    8237          NA            NA         NA          NA
## 89: 2020-03-30    4249  7.36500000           0.1       0.00         0.1
## 90: 2020-03-31    8291 10.23666667           0.0       0.00         3.4
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or: 
# setkey(weather_data, DATE) 
# weather_data[rev_data]

# dplyr
# rev_data %>% 
#   left_join( weather_data, by = "DATE")

# outer join (right)
# data.table
merge(rev_data, weather_data, all.y = TRUE, by = "DATE") 
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
##  1: 2020-01-01    7637  0.00000000           0.0       0.00         0.0
##  2: 2020-01-02    9276  0.37250000           0.0       0.00         5.8
##  3: 2020-01-03   11170  0.26361111           0.0       0.00         5.4
##  4: 2020-01-04   11863  3.54861111           0.0       0.00         2.4
##  5: 2020-01-05   10880  2.51250000           2.4       0.00         0.1
##  6: 2020-01-06    6702  0.09333333           0.2       0.00         4.8
##  7: 2020-01-07    8652  0.00000000           0.0       0.00         5.5
##  8: 2020-01-08    8346  0.16166667           0.0       0.00         6.5
##  9: 2020-01-09    6543  0.31944444           0.6       0.00         3.9
## 10: 2020-01-10    8115  1.58694444           0.0       0.00         1.0
## 11: 2020-01-11    7728  0.00000000           0.2       0.00         1.2
## 12: 2020-01-12   10649  0.44194444           0.0       0.00         5.9
## 13: 2020-01-13    6787  2.42083333           0.0       0.00         2.6
## 14: 2020-01-14    4555  0.00000000           7.4       0.00         3.8
## 15: 2020-01-15    5885  0.00000000           0.0       0.00         7.8
## 16: 2020-01-16   10127  0.85583333           0.0       0.00         6.1
## 17: 2020-01-17    8893  0.00000000           0.0       0.00         4.8
## 18: 2020-01-18   12520  0.05500000           0.0       0.00         4.8
## 19: 2020-01-19   11860  6.18555556           0.0       0.00         3.1
## 20: 2020-01-20    8515  0.77305556           0.0       0.00         3.8
## 21: 2020-01-21    8129  2.09777778           0.0       0.00         6.8
## 22: 2020-01-22   10405  3.55250000           0.0       0.00         3.2
## 23: 2020-01-23    6672  2.02694444           0.0       0.00         1.5
## 24: 2020-01-24   12300  4.97361111           0.0       0.00         5.1
## 25: 2020-01-25   10651  4.45861111           0.0       0.00         2.1
## 26: 2020-01-26   11882  0.00000000           0.0       0.00         5.8
## 27: 2020-01-27    9397  0.00000000           0.6       0.00         4.8
## 28: 2020-01-28    5174  0.00000000          10.9       0.00         3.4
## 29: 2020-01-29    4436  0.00000000           1.5       0.00         3.3
## 30: 2020-01-30    6202  0.00000000           0.2       0.00         2.4
## 31: 2020-01-31    9949  0.00000000           2.0       0.00         3.0
## 32: 2020-02-01   10192  1.50750000           1.5       0.00         6.1
## 33: 2020-02-02   13904  6.70888889           0.6       0.00         3.8
## 34: 2020-02-03   11208  6.85888889           0.0       0.00         0.5
## 35: 2020-02-04    8578  0.09138889           0.2       0.00        -1.3
## 36: 2020-02-07    8187  1.15611111           0.0       0.00         0.1
## 37: 2020-02-08   10099  0.00000000           0.8       0.00         2.4
## 38: 2020-02-09    8160  1.14805556           6.4       0.00         4.5
## 39: 2020-02-10    3797  0.23944444           0.7       0.00         5.7
## 40: 2020-02-11    8734  0.69166667           0.0       0.00         4.0
## 41: 2020-02-12    4355  1.69333333           1.5       0.00         3.2
## 42: 2020-02-13    8452  8.53333333           0.0       0.00         1.0
## 43: 2020-02-14    7367  6.66250000           0.0       0.00         0.8
## 44: 2020-02-15    9339  0.01416667           4.6       0.00         3.8
## 45: 2020-02-16    5427  0.00000000           1.5       0.00         7.1
## 46: 2020-02-17    7022  1.55083333           0.1       0.00         6.6
## 47: 2020-02-18    8417  2.58805556           0.0       0.00         6.6
## 48: 2020-02-19    8760  7.62638889           0.0       0.00         4.5
## 49: 2020-02-20    4936  3.16527778           5.0       0.00         2.8
## 50: 2020-02-21   11273  7.49055556           0.0       0.00         5.7
## 51: 2020-02-22    7046  0.20055556           0.8       0.00         6.5
## 52: 2020-02-23   10074  6.78666667           0.0       0.00         5.2
## 53: 2020-02-24    9114  8.94694444           0.0       0.00         2.3
## 54: 2020-02-25    4493  2.53583333           1.1       0.00         1.4
## 55: 2020-02-26    4992  0.51944444           0.5       0.00         0.2
## 56: 2020-02-27    3789  0.00000000           0.8       0.01        -1.3
## 57: 2020-02-28    7480  6.55861111           0.2       0.05        -1.3
## 58: 2020-02-29    8358  2.25750000           4.5       0.05         1.1
## 59: 2020-03-01      NA  0.91555556           1.0       0.00         4.2
## 60: 2020-03-02    6489  6.01888889           4.0       0.00         4.4
## 61: 2020-03-03    3586  0.00000000           7.2       0.00         3.7
## 62: 2020-03-04    2570  0.00000000           6.4       0.00         2.1
## 63: 2020-03-05    5059  0.04666667           0.1       0.00         2.1
## 64: 2020-03-06    9087  9.34805556           0.0       0.00         2.0
## 65: 2020-03-07   11570  2.85055556           0.0       0.00         1.9
## 66: 2020-03-08   10476  2.89916667           0.0       0.00         4.5
## 67: 2020-03-09    8977  3.60500000           2.0       0.00         6.8
## 68: 2020-03-11    5837  0.13555556           0.0       0.00         5.6
## 69: 2020-03-12    3559  0.66138889          15.2       0.00         4.9
## 70: 2020-03-13    7489  5.24972222           0.1       0.00         2.1
## 71: 2020-03-14    8138 11.19111111           0.0       0.00        -0.9
## 72: 2020-03-15    8876  5.43888889           2.1       0.00         2.0
## 73: 2020-03-16    4800  6.09472222           2.9       0.00         4.1
## 74: 2020-03-17    3836  0.59083333           0.0       0.00         5.7
## 75: 2020-03-18    2750  1.32916667           5.8       0.00         6.3
## 76: 2020-03-19    7715 11.57500000           0.1       0.00         5.5
## 77: 2020-03-20    4364  3.21694444           0.1       0.00         2.1
## 78: 2020-03-21    9393 10.37333333           0.0       0.00         0.0
## 79: 2020-03-22    7830 11.13527778           0.0       0.00         0.1
## 80: 2020-03-23    4858  6.43416667           0.0       0.00         2.8
## 81: 2020-03-24    5140  5.35000000           0.0       0.00         4.2
## 82: 2020-03-25    7043  6.95083333           0.0       0.00         7.0
## 83: 2020-03-26    8662 11.44222222           0.0       0.00         6.5
## 84: 2020-03-27    9196  9.69944444           0.0       0.00         6.2
## 85: 2020-03-28    8267  3.41638889           0.0       0.00         5.3
## 86: 2020-03-30    4249  7.36500000           0.1       0.00         0.1
## 87: 2020-03-31    8291 10.23666667           0.0       0.00         3.4
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# Or: 
# setkey(rev_data, DATE) 
# rev_data[weather_data]

# dplyr
# rev_data %>%
#   right_join(weather_data, by = "DATE")

# full join
# data.table
merge(rev_data, weather_data, all = TRUE, by = "DATE")
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
##  1: 2020-01-01    7637  0.00000000           0.0       0.00         0.0
##  2: 2020-01-02    9276  0.37250000           0.0       0.00         5.8
##  3: 2020-01-03   11170  0.26361111           0.0       0.00         5.4
##  4: 2020-01-04   11863  3.54861111           0.0       0.00         2.4
##  5: 2020-01-05   10880  2.51250000           2.4       0.00         0.1
##  6: 2020-01-06    6702  0.09333333           0.2       0.00         4.8
##  7: 2020-01-07    8652  0.00000000           0.0       0.00         5.5
##  8: 2020-01-08    8346  0.16166667           0.0       0.00         6.5
##  9: 2020-01-09    6543  0.31944444           0.6       0.00         3.9
## 10: 2020-01-10    8115  1.58694444           0.0       0.00         1.0
## 11: 2020-01-11    7728  0.00000000           0.2       0.00         1.2
## 12: 2020-01-12   10649  0.44194444           0.0       0.00         5.9
## 13: 2020-01-13    6787  2.42083333           0.0       0.00         2.6
## 14: 2020-01-14    4555  0.00000000           7.4       0.00         3.8
## 15: 2020-01-15    5885  0.00000000           0.0       0.00         7.8
## 16: 2020-01-16   10127  0.85583333           0.0       0.00         6.1
## 17: 2020-01-17    8893  0.00000000           0.0       0.00         4.8
## 18: 2020-01-18   12520  0.05500000           0.0       0.00         4.8
## 19: 2020-01-19   11860  6.18555556           0.0       0.00         3.1
## 20: 2020-01-20    8515  0.77305556           0.0       0.00         3.8
## 21: 2020-01-21    8129  2.09777778           0.0       0.00         6.8
## 22: 2020-01-22   10405  3.55250000           0.0       0.00         3.2
## 23: 2020-01-23    6672  2.02694444           0.0       0.00         1.5
## 24: 2020-01-24   12300  4.97361111           0.0       0.00         5.1
## 25: 2020-01-25   10651  4.45861111           0.0       0.00         2.1
## 26: 2020-01-26   11882  0.00000000           0.0       0.00         5.8
## 27: 2020-01-27    9397  0.00000000           0.6       0.00         4.8
## 28: 2020-01-28    5174  0.00000000          10.9       0.00         3.4
## 29: 2020-01-29    4436  0.00000000           1.5       0.00         3.3
## 30: 2020-01-30    6202  0.00000000           0.2       0.00         2.4
## 31: 2020-01-31    9949  0.00000000           2.0       0.00         3.0
## 32: 2020-02-01   10192  1.50750000           1.5       0.00         6.1
## 33: 2020-02-02   13904  6.70888889           0.6       0.00         3.8
## 34: 2020-02-03   11208  6.85888889           0.0       0.00         0.5
## 35: 2020-02-04    8578  0.09138889           0.2       0.00        -1.3
## 36: 2020-02-05    6638          NA            NA         NA          NA
## 37: 2020-02-06    7093          NA            NA         NA          NA
## 38: 2020-02-07    8187  1.15611111           0.0       0.00         0.1
## 39: 2020-02-08   10099  0.00000000           0.8       0.00         2.4
## 40: 2020-02-09    8160  1.14805556           6.4       0.00         4.5
## 41: 2020-02-10    3797  0.23944444           0.7       0.00         5.7
## 42: 2020-02-11    8734  0.69166667           0.0       0.00         4.0
## 43: 2020-02-12    4355  1.69333333           1.5       0.00         3.2
## 44: 2020-02-13    8452  8.53333333           0.0       0.00         1.0
## 45: 2020-02-14    7367  6.66250000           0.0       0.00         0.8
## 46: 2020-02-15    9339  0.01416667           4.6       0.00         3.8
## 47: 2020-02-16    5427  0.00000000           1.5       0.00         7.1
## 48: 2020-02-17    7022  1.55083333           0.1       0.00         6.6
## 49: 2020-02-18    8417  2.58805556           0.0       0.00         6.6
## 50: 2020-02-19    8760  7.62638889           0.0       0.00         4.5
## 51: 2020-02-20    4936  3.16527778           5.0       0.00         2.8
## 52: 2020-02-21   11273  7.49055556           0.0       0.00         5.7
## 53: 2020-02-22    7046  0.20055556           0.8       0.00         6.5
## 54: 2020-02-23   10074  6.78666667           0.0       0.00         5.2
## 55: 2020-02-24    9114  8.94694444           0.0       0.00         2.3
## 56: 2020-02-25    4493  2.53583333           1.1       0.00         1.4
## 57: 2020-02-26    4992  0.51944444           0.5       0.00         0.2
## 58: 2020-02-27    3789  0.00000000           0.8       0.01        -1.3
## 59: 2020-02-28    7480  6.55861111           0.2       0.05        -1.3
## 60: 2020-02-29    8358  2.25750000           4.5       0.05         1.1
## 61: 2020-03-01      NA  0.91555556           1.0       0.00         4.2
## 62: 2020-03-02    6489  6.01888889           4.0       0.00         4.4
## 63: 2020-03-03    3586  0.00000000           7.2       0.00         3.7
## 64: 2020-03-04    2570  0.00000000           6.4       0.00         2.1
## 65: 2020-03-05    5059  0.04666667           0.1       0.00         2.1
## 66: 2020-03-06    9087  9.34805556           0.0       0.00         2.0
## 67: 2020-03-07   11570  2.85055556           0.0       0.00         1.9
## 68: 2020-03-08   10476  2.89916667           0.0       0.00         4.5
## 69: 2020-03-09    8977  3.60500000           2.0       0.00         6.8
## 70: 2020-03-10    4509          NA            NA         NA          NA
## 71: 2020-03-11    5837  0.13555556           0.0       0.00         5.6
## 72: 2020-03-12    3559  0.66138889          15.2       0.00         4.9
## 73: 2020-03-13    7489  5.24972222           0.1       0.00         2.1
## 74: 2020-03-14    8138 11.19111111           0.0       0.00        -0.9
## 75: 2020-03-15    8876  5.43888889           2.1       0.00         2.0
## 76: 2020-03-16    4800  6.09472222           2.9       0.00         4.1
## 77: 2020-03-17    3836  0.59083333           0.0       0.00         5.7
## 78: 2020-03-18    2750  1.32916667           5.8       0.00         6.3
## 79: 2020-03-19    7715 11.57500000           0.1       0.00         5.5
## 80: 2020-03-20    4364  3.21694444           0.1       0.00         2.1
## 81: 2020-03-21    9393 10.37333333           0.0       0.00         0.0
## 82: 2020-03-22    7830 11.13527778           0.0       0.00         0.1
## 83: 2020-03-23    4858  6.43416667           0.0       0.00         2.8
## 84: 2020-03-24    5140  5.35000000           0.0       0.00         4.2
## 85: 2020-03-25    7043  6.95083333           0.0       0.00         7.0
## 86: 2020-03-26    8662 11.44222222           0.0       0.00         6.5
## 87: 2020-03-27    9196  9.69944444           0.0       0.00         6.2
## 88: 2020-03-28    8267  3.41638889           0.0       0.00         5.3
## 89: 2020-03-29    8237          NA            NA         NA          NA
## 90: 2020-03-30    4249  7.36500000           0.1       0.00         0.1
## 91: 2020-03-31    8291 10.23666667           0.0       0.00         3.4
##           DATE REVENUE   SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
# dplyr
# rev_data %>% 
#   full_join( weather_data, by = "DATE")

# another example
mayordata <- full_join(contributions_split, results_split, by = "LastName")
str(mayordata)
## Classes 'tabyl' and 'data.frame':    7 obs. of  3 variables:
##  $ LastName              : chr  "Horrigan" "Neves-Grigg" "Sen" "Sousa" ...
##  $ Pct_Local_Contributors: num  0.03582 0.01194 0.00896 0.02985 0.51642 ...
##  $ Pct_Vote              : num  0.04996 0.01228 0.00926 0.04932 0.54703 ...
head(mayordata)
##     LastName Pct_Local_Contributors    Pct_Vote
##     Horrigan            0.035820896 0.049963330
##  Neves-Grigg            0.011940299 0.012284562
##          Sen            0.008955224 0.009259259
##        Sousa            0.029850746 0.049321599
##       Spicer            0.516417910 0.547029703
##    Stefanini            0.337313433 0.291895856
# preparing the data
filter_data <- weather_data[TEMPERATURE < 0 & DATE %between% c("2020-02-01", "2020-02-29"), ]
head(filter_data)
##          DATE  SUN_HOURS PRECIPITATION SNOW_DEPTH TEMPERATURE
## 1: 2020-02-04 0.09138889           0.2       0.00        -1.3
## 2: 2020-02-27 0.00000000           0.8       0.01        -1.3
## 3: 2020-02-28 6.55861111           0.2       0.05        -1.3
# using a semijoin
# data.table
setkey(rev_data, DATE)
rev_data[rev_data[filter_data, which = TRUE]]
##          DATE REVENUE
## 1: 2020-02-04    8578
## 2: 2020-02-27    3789
## 3: 2020-02-28    7480
# dplyr
# rev_data %>%
#   semi_join(filter_data, by = "DATE")

# antijoin
# data.table
setkey(rev_data, DATE)
rev_data[!filter_data]
##           DATE REVENUE
##  1: 2020-01-01    7637
##  2: 2020-01-02    9276
##  3: 2020-01-03   11170
##  4: 2020-01-04   11863
##  5: 2020-01-05   10880
##  6: 2020-01-06    6702
##  7: 2020-01-07    8652
##  8: 2020-01-08    8346
##  9: 2020-01-09    6543
## 10: 2020-01-10    8115
## 11: 2020-01-11    7728
## 12: 2020-01-12   10649
## 13: 2020-01-13    6787
## 14: 2020-01-14    4555
## 15: 2020-01-15    5885
## 16: 2020-01-16   10127
## 17: 2020-01-17    8893
## 18: 2020-01-18   12520
## 19: 2020-01-19   11860
## 20: 2020-01-20    8515
## 21: 2020-01-21    8129
## 22: 2020-01-22   10405
## 23: 2020-01-23    6672
## 24: 2020-01-24   12300
## 25: 2020-01-25   10651
## 26: 2020-01-26   11882
## 27: 2020-01-27    9397
## 28: 2020-01-28    5174
## 29: 2020-01-29    4436
## 30: 2020-01-30    6202
## 31: 2020-01-31    9949
## 32: 2020-02-01   10192
## 33: 2020-02-02   13904
## 34: 2020-02-03   11208
## 35: 2020-02-05    6638
## 36: 2020-02-06    7093
## 37: 2020-02-07    8187
## 38: 2020-02-08   10099
## 39: 2020-02-09    8160
## 40: 2020-02-10    3797
## 41: 2020-02-11    8734
## 42: 2020-02-12    4355
## 43: 2020-02-13    8452
## 44: 2020-02-14    7367
## 45: 2020-02-15    9339
## 46: 2020-02-16    5427
## 47: 2020-02-17    7022
## 48: 2020-02-18    8417
## 49: 2020-02-19    8760
## 50: 2020-02-20    4936
## 51: 2020-02-21   11273
## 52: 2020-02-22    7046
## 53: 2020-02-23   10074
## 54: 2020-02-24    9114
## 55: 2020-02-25    4493
## 56: 2020-02-26    4992
## 57: 2020-02-29    8358
## 58: 2020-03-02    6489
## 59: 2020-03-03    3586
## 60: 2020-03-04    2570
## 61: 2020-03-05    5059
## 62: 2020-03-06    9087
## 63: 2020-03-07   11570
## 64: 2020-03-08   10476
## 65: 2020-03-09    8977
## 66: 2020-03-10    4509
## 67: 2020-03-11    5837
## 68: 2020-03-12    3559
## 69: 2020-03-13    7489
## 70: 2020-03-14    8138
## 71: 2020-03-15    8876
## 72: 2020-03-16    4800
## 73: 2020-03-17    3836
## 74: 2020-03-18    2750
## 75: 2020-03-19    7715
## 76: 2020-03-20    4364
## 77: 2020-03-21    9393
## 78: 2020-03-22    7830
## 79: 2020-03-23    4858
## 80: 2020-03-24    5140
## 81: 2020-03-25    7043
## 82: 2020-03-26    8662
## 83: 2020-03-27    9196
## 84: 2020-03-28    8267
## 85: 2020-03-29    8237
## 86: 2020-03-30    4249
## 87: 2020-03-31    8291
##           DATE REVENUE
# dplyr
# rev_data %>%
#   anti_join(filter_data, by = "DATE")

5 Summarizing data sets

5.1 Base-R

# returns column index names in table format
data.frame(colnames(snowdata))
##   colnames.snowdata.
## 1             Winter
## 2         SnowInches
## 3         SnowMeters
# returns row index numbers in table format
data.frame(as.integer(rownames(snowdata)))
##    as.integer.rownames.snowdata..
## 1                               1
## 2                               2
## 3                               3
## 4                               4
## 5                               5
## 6                               6
## 7                               7
## 8                               8
## 9                               9
## 10                             10
## 11                             11
## 12                             12
## 13                             13
## 14                             14
## 15                             15
## 16                             16
## 17                             17
## 18                             18
## 19                             19
## 20                             20
## 21                             21
## 22                             22
## 23                             23
## 24                             24
## 25                             25
## 26                             26
## 27                             27
## 28                             28
## 29                             29
## 30                             30
## 31                             31
## 32                             32
## 33                             33
## 34                             34
## 35                             35
## 36                             36
## 37                             37
## 38                             38
## 39                             39
## 40                             40
## 41                             41
## 42                             42
## 43                             43
## 44                             44
## 45                             45
## 46                             46
## 47                             47
## 48                             48
## 49                             49
## 50                             50
## 51                             51
## 52                             52
## 53                             53
## 54                             54
## 55                             55
## 56                             56
## 57                             57
## 58                             58
## 59                             59
## 60                             60
## 61                             61
## 62                             62
## 63                             63
## 64                             64
## 65                             65
## 66                             66
## 67                             67
## 68                             68
## 69                             69
## 70                             70
## 71                             71
## 72                             72
## 73                             73
## 74                             74
## 75                             75
## 76                             76
# getting a sense of the data set
str(snowdata)
## 'data.frame':    76 obs. of  3 variables:
##  $ Winter    : chr  "1940-1941" "1941-1942" "1942-1943" "1943-1944" ...
##  $ SnowInches: num  47.8 23.9 45.7 27.7 59.2 50.8 19.4 89.2 37.1 32 ...
##  $ SnowMeters: num  1.214 0.607 1.161 0.704 1.504 ...
# showing the number of rows and columns...
dim(snowdata)
## [1] 76  3
nrow(snowdata)
## [1] 76
ncol(snowdata)
## [1] 3
# ...and names 
dimnames(snowdata)
## [[1]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76"
## 
## [[2]]
## [1] "Winter"     "SnowInches" "SnowMeters"
rownames(snowdata)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [76] "76"
colnames(snowdata)
## [1] "Winter"     "SnowInches" "SnowMeters"
# a brief statistical summary of a data set, run the summary() function
summary(snowdata)
##     Winter            SnowInches       SnowMeters    
##  Length:76          Min.   :  9.30   Min.   :0.2362  
##  Class :character   1st Qu.: 27.57   1st Qu.:0.7004  
##  Mode  :character   Median : 42.75   Median :1.0858  
##                     Mean   : 44.49   Mean   :1.1299  
##                     3rd Qu.: 57.60   3rd Qu.:1.4630  
##                     Max.   :110.60   Max.   :2.8092

5.2 Tidyverse

glimpse(snowdata)
## Rows: 76
## Columns: 3
## $ Winter     <chr> "1940-1941", "1941-1942", "1942-1943", "1943-1944", "1944-1…
## $ SnowInches <dbl> 47.8, 23.9, 45.7, 27.7, 59.2, 50.8, 19.4, 89.2, 37.1, 32.0,…
## $ SnowMeters <dbl> 1.21412, 0.60706, 1.16078, 0.70358, 1.50368, 1.29032, 0.492…

5.3 Packages

Find and concisely describe the difference between a pair of R objects with waldo package. One of the first things worth doing after importing a data set is looking at the first few rows, the last few rows, and a summary of some basic stats. This can be easily achieved thanks to the headTail() function of the psych package. To get a brief statistical summary you can use the describe() function of the Hmisc package or the psych package (only works for numeric data). The skimr package’s skim() function will show information on each column, including a little histogram for each numeric one.

df1 <- data.frame(X = c(1, 2, 3), Y = c("a", "b", "c"), A = c(3, 4, 5))
df2 <- data.frame(X = c(1, 2, 3, 4), Y = c("A", "b", "c", "d"), Z = c("k", "l", "m", "n"), A = c("3", "4", "5", "6"))
waldo::compare(df1, df2)
## `old` is length 3
## `new` is length 4
## 
## `names(old)`: "X" "Y" "A"    
## `names(new)`: "X" "Y" "Z" "A"
## 
## `attr(old, 'row.names')`: 1 2 3  
## `attr(new, 'row.names')`: 1 2 3 4
## 
## `old$X`: 1 2 3  
## `new$X`: 1 2 3 4
## 
## `old$Y[2:3]`:         "b" "c"
## `new$Y`:      "A" "b" "c" "d"
## 
## `old$A` is a double vector (3, 4, 5)
## `new$A` is a character vector ('3', '4', '5', '6')
## 
## `old$Z` is absent
## `new$Z` is a character vector ('k', 'l', 'm', 'n')
# getting the first and last rows
headTail(snowdata)
##        Winter SnowInches SnowMeters
## 1   1940-1941       47.8       1.21
## 2   1941-1942       23.9       0.61
## 3   1942-1943       45.7       1.16
## 4   1943-1944       27.7        0.7
## ...      <NA>        ...        ...
## 73  2012-2013       63.4       1.61
## 74  2013-2014       58.9        1.5
## 75  2014-2015      110.6       2.81
## 76  2015-2016       36.2       0.92
# getting statistical info
Hmisc::describe(snowdata)
## snowdata 
## 
##  3  Variables      76  Observations
## --------------------------------------------------------------------------------
## Winter 
##        n  missing distinct 
##       76        0       76 
## 
## lowest : 1940-1941 1941-1942 1942-1943 1943-1944 1944-1945
## highest: 2011-2012 2012-2013 2013-2014 2014-2015 2015-2016
## --------------------------------------------------------------------------------
## SnowInches 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       76        0       75        1    44.49    24.92    15.05    18.60 
##      .25      .50      .75      .90      .95 
##    27.58    42.75    57.60    75.95    87.25 
## 
## lowest :   9.3  10.3  12.5  14.9  15.1, highest:  86.6  89.2  96.3 107.6 110.6
## --------------------------------------------------------------------------------
## SnowMeters 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       76        0       75        1     1.13   0.6329   0.3823   0.4724 
##      .25      .50      .75      .90      .95 
##   0.7004   1.0858   1.4630   1.9291   2.2161 
## 
## lowest : 0.23622 0.26162 0.31750 0.37846 0.38354
## highest: 2.19964 2.26568 2.44602 2.73304 2.80924
## --------------------------------------------------------------------------------
psych::describe(snowdata)
##            vars  n  mean    sd median trimmed   mad  min    max  range skew
## Winter*       1 76 38.50 22.08  38.50   38.50 28.17 1.00  76.00  75.00 0.00
## SnowInches    2 76 44.49 22.51  42.75   42.37 22.54 9.30 110.60 101.30 0.84
## SnowMeters    3 76  1.13  0.57   1.09    1.08  0.57 0.24   2.81   2.57 0.84
##            kurtosis   se
## Winter*       -1.25 2.53
## SnowInches     0.46 2.58
## SnowMeters     0.46 0.07
skim(snowdata)
Data summary
Name snowdata
Number of rows 76
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Winter 0 1 9 9 0 76 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SnowInches 0 1 44.49 22.51 9.30 27.58 42.75 57.60 110.60 ▆▇▅▂▁
SnowMeters 0 1 1.13 0.57 0.24 0.70 1.09 1.46 2.81 ▆▇▅▂▁

6 Subsetting and modifying data frames

6.1 Base-R

For objects that contain more than one element (vectors, matrices, arrays, data frames, and lists), subscripting is used to access some or all of those elements. Besides the usual numeric subscripts, R allows the use of character or logical values for subscripting. Subscripting operations are very fast and efficient, and are often the most powerful tool for accessing and manipulating data in R.

Like most computer languages, numeric subscripts can be used to access the elements of a vector, array, or list. The first element of an object has subscript 1; subscripts of 0 are silently ignored. In addition to a single number, a vector of subscripts (or, for example, a function call that returns a vector of subscripts) can be used to access multiple elements. The colon operator and the seq function are especially useful here. Negative subscripts in R extract all of the elements of an object except the ones specified in the negative subscript; thus, when using numeric subscripts, subscripts must be either all positive (or zero) or all negative (or zero).

If a subscriptable object is named, a character string or vector of character strings can be used as a subscript. Negative character subscripts are not permitted; if you need to exclude elements based on their names, the grep function can be used.

Logical values can be used to selectively access elements of a subscriptable object, provided the size of the logical object is the same as the object (or part of the object) that is being subscripted. Elements corresponding to TRUE values in the logical vector will be included, and objects corresponding to FALSE values will not. Logical subscripting provides a very powerful and simple way to perform tasks that might otherwise require loops. Like most operations in R, logical operators are vectorized; applying a logical subscript to a vector or an array will produce an object of the same size and shape as the original object. To find the indices of elements, R provides the which function, which accepts a logical vector, and returns a vector containing the subscripts of the elements for which the logical vector was true. Logical subscripts allow for modification of elements that meet a particular condition by using an appropriately subscripted object on the left-hand side of an assignment statement.

Lists are the most general way to store a collection of objects in R, because there is no limitation on the mode of the objects that a list may hold. Although it hasn”t been explicitly stated, one rule of subscripting in R is that subscripting will always return an object of the same mode as the object being subscripted. For matrices and vectors, this is completely natural, and should never cause confusion. But for lists, there is a subtle distinction between part of a list, and the object which that part of the list represents.

If the elements of the list are named, the actual contents of the elements can be accessed by separating the name of the list from the name of the element with a dollar sign ($). For interactive sessions, using the dollar sign notation is the natural way to perform operations on the elements of a list. For those situations where the dollar sign notation would be inappropriate (for example, accessing elements through their index or through a name stored in a character variable), R provides the double bracket subscript operator. Double brackets are not restricted to respect the mode of the object they are subscripting, and will extract the actual list element from the list.

The key thing to notice is that in this case, single brackets will always return a list containing the selected element(s), while double brackets will return the actual contents of selected list element.

Since data frames are a cross between a list and a matrix, it”s not surprising that both matrix and list subscripting techniques apply to data frames. When using logical subscripts with data frames containing missing values, it may be necessary to remove the missing values before the logical comparison is made, or unexpected results may occur. This situation is so common that R provides the subset function which accepts a data frame, matrix or vector, and a logical expression as its first two arguments, and which returns a similar object containing only those elements that meet the condition of the logical expression. It insures that missing values don”t get included, and, if its first argument is a data frame or matrix with named columns, it also resolves variable names inside the logical expression from the object passed as the first argument. A further convenience is offered by the select= argument which will extract only the specified columns from the data frame passed as the first argument. The argument to select= is a vector of integers or variable names which correspond to the columns that are to be extracted. Unlike most other functions in R, names passed through the select= argument can be either quoted or unquoted. To ignore columns, their name or index number can be preceded by a negative sign (-). Since the select= argument works by replacing variable names with their corresponding column indices, ranges of columns can be specified using variable names. The subset function will always return a new data frame, matrix or vector, so it is not suited for modifying selected parts of a data frame.

To sort the elements of an object, use the sort function. Add the decreasing=TRUE option to sort in reverse order. You can control the treatment of NA values by setting the na.last argument. To sort a data frame, you need to create a permutation of the indices from the data frame and use these to fetch the rows of the data frame in the correct order. You can generate an appropriate permutation of the indices using the order function. The order function takes a set of vectors as arguments. It sorts recursively by each vector, breaking ties by looking at successive vectors in the argument list. At the end, it returns a permutation of the indices of the vector corresponding to the sorted order.

# subsetting a list
simple <- list(a = c("fred", "sam", "harry"), b = c(24, 17, 19, 22))
mode(simple)
## [1] "list"
simple[2]
## $b
## [1] 24 17 19 22
mode(simple[2])
## [1] "list"
simple$b
## [1] 24 17 19 22
mean(simple$b)
## [1] 20.5
mean(simple[[2]])
## [1] 20.5
# single brackets return a list
simple[1]
## $a
## [1] "fred"  "sam"   "harry"
# double brackets return the actual contents of selected list element
simple[[1]]
## [1] "fred"  "sam"   "harry"
# subsetting a data frame
# First, we check the order of the columns:
data.frame(names(airquality))
##   names.airquality.
## 1             Ozone
## 2           Solar.R
## 3              Wind
## 4              Temp
## 5             Month
## 6               Day
airquality[5, 4]    # The 5th element from the 4th column,
## [1] 56
                    # i.e. the same as airquality$Temp[5]
airquality[5,]      # The 5th row of the data
##   Ozone Solar.R Wind Temp Month Day
## 5    NA      NA 14.3   56     5   5
airquality[, 4]     # The 4th column of the data, like airquality$Temp
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
airquality[[4]]     # The 4th column of the data, like airquality$Temp
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
airquality[, c(2, 4, 6)] # The 2nd, 4th and 6th columns of the data
##     Solar.R Temp Day
## 1       190   67   1
## 2       118   72   2
## 3       149   74   3
## 4       313   62   4
## 5        NA   56   5
## 6        NA   66   6
## 7       299   65   7
## 8        99   59   8
## 9        19   61   9
## 10      194   69  10
## 11       NA   74  11
## 12      256   69  12
## 13      290   66  13
## 14      274   68  14
## 15       65   58  15
## 16      334   64  16
## 17      307   66  17
## 18       78   57  18
## 19      322   68  19
## 20       44   62  20
## 21        8   59  21
## 22      320   73  22
## 23       25   61  23
## 24       92   61  24
## 25       66   57  25
## 26      266   58  26
## 27       NA   57  27
## 28       13   67  28
## 29      252   81  29
## 30      223   79  30
## 31      279   76  31
## 32      286   78   1
## 33      287   74   2
## 34      242   67   3
## 35      186   84   4
## 36      220   85   5
## 37      264   79   6
## 38      127   82   7
## 39      273   87   8
## 40      291   90   9
## 41      323   87  10
## 42      259   93  11
## 43      250   92  12
## 44      148   82  13
## 45      332   80  14
## 46      322   79  15
## 47      191   77  16
## 48      284   72  17
## 49       37   65  18
## 50      120   73  19
## 51      137   76  20
## 52      150   77  21
## 53       59   76  22
## 54       91   76  23
## 55      250   76  24
## 56      135   75  25
## 57      127   78  26
## 58       47   73  27
## 59       98   80  28
## 60       31   77  29
## 61      138   83  30
## 62      269   84   1
## 63      248   85   2
## 64      236   81   3
## 65      101   84   4
## 66      175   83   5
## 67      314   83   6
## 68      276   88   7
## 69      267   92   8
## 70      272   92   9
## 71      175   89  10
## 72      139   82  11
## 73      264   73  12
## 74      175   81  13
## 75      291   91  14
## 76       48   80  15
## 77      260   81  16
## 78      274   82  17
## 79      285   84  18
## 80      187   87  19
## 81      220   85  20
## 82        7   74  21
## 83      258   81  22
## 84      295   82  23
## 85      294   86  24
## 86      223   85  25
## 87       81   82  26
## 88       82   86  27
## 89      213   88  28
## 90      275   86  29
## 91      253   83  30
## 92      254   81  31
## 93       83   81   1
## 94       24   81   2
## 95       77   82   3
## 96       NA   86   4
## 97       NA   85   5
## 98       NA   87   6
## 99      255   89   7
## 100     229   90   8
## 101     207   90   9
## 102     222   92  10
## 103     137   86  11
## 104     192   86  12
## 105     273   82  13
## 106     157   80  14
## 107      64   79  15
## 108      71   77  16
## 109      51   79  17
## 110     115   76  18
## 111     244   78  19
## 112     190   78  20
## 113     259   77  21
## 114      36   72  22
## 115     255   75  23
## 116     212   79  24
## 117     238   81  25
## 118     215   86  26
## 119     153   88  27
## 120     203   97  28
## 121     225   94  29
## 122     237   96  30
## 123     188   94  31
## 124     167   91   1
## 125     197   92   2
## 126     183   93   3
## 127     189   93   4
## 128      95   87   5
## 129      92   84   6
## 130     252   80   7
## 131     220   78   8
## 132     230   75   9
## 133     259   73  10
## 134     236   81  11
## 135     259   76  12
## 136     238   77  13
## 137      24   71  14
## 138     112   71  15
## 139     237   78  16
## 140     224   67  17
## 141      27   76  18
## 142     238   68  19
## 143     201   82  20
## 144     238   64  21
## 145      14   71  22
## 146     139   81  23
## 147      49   69  24
## 148      20   63  25
## 149     193   70  26
## 150     145   77  27
## 151     191   75  28
## 152     131   76  29
## 153     223   68  30
airquality[, -2]    # All columns except the 2nd one
##     Ozone Wind Temp Month Day
## 1      41  7.4   67     5   1
## 2      36  8.0   72     5   2
## 3      12 12.6   74     5   3
## 4      18 11.5   62     5   4
## 5      NA 14.3   56     5   5
## 6      28 14.9   66     5   6
## 7      23  8.6   65     5   7
## 8      19 13.8   59     5   8
## 9       8 20.1   61     5   9
## 10     NA  8.6   69     5  10
## 11      7  6.9   74     5  11
## 12     16  9.7   69     5  12
## 13     11  9.2   66     5  13
## 14     14 10.9   68     5  14
## 15     18 13.2   58     5  15
## 16     14 11.5   64     5  16
## 17     34 12.0   66     5  17
## 18      6 18.4   57     5  18
## 19     30 11.5   68     5  19
## 20     11  9.7   62     5  20
## 21      1  9.7   59     5  21
## 22     11 16.6   73     5  22
## 23      4  9.7   61     5  23
## 24     32 12.0   61     5  24
## 25     NA 16.6   57     5  25
## 26     NA 14.9   58     5  26
## 27     NA  8.0   57     5  27
## 28     23 12.0   67     5  28
## 29     45 14.9   81     5  29
## 30    115  5.7   79     5  30
## 31     37  7.4   76     5  31
## 32     NA  8.6   78     6   1
## 33     NA  9.7   74     6   2
## 34     NA 16.1   67     6   3
## 35     NA  9.2   84     6   4
## 36     NA  8.6   85     6   5
## 37     NA 14.3   79     6   6
## 38     29  9.7   82     6   7
## 39     NA  6.9   87     6   8
## 40     71 13.8   90     6   9
## 41     39 11.5   87     6  10
## 42     NA 10.9   93     6  11
## 43     NA  9.2   92     6  12
## 44     23  8.0   82     6  13
## 45     NA 13.8   80     6  14
## 46     NA 11.5   79     6  15
## 47     21 14.9   77     6  16
## 48     37 20.7   72     6  17
## 49     20  9.2   65     6  18
## 50     12 11.5   73     6  19
## 51     13 10.3   76     6  20
## 52     NA  6.3   77     6  21
## 53     NA  1.7   76     6  22
## 54     NA  4.6   76     6  23
## 55     NA  6.3   76     6  24
## 56     NA  8.0   75     6  25
## 57     NA  8.0   78     6  26
## 58     NA 10.3   73     6  27
## 59     NA 11.5   80     6  28
## 60     NA 14.9   77     6  29
## 61     NA  8.0   83     6  30
## 62    135  4.1   84     7   1
## 63     49  9.2   85     7   2
## 64     32  9.2   81     7   3
## 65     NA 10.9   84     7   4
## 66     64  4.6   83     7   5
## 67     40 10.9   83     7   6
## 68     77  5.1   88     7   7
## 69     97  6.3   92     7   8
## 70     97  5.7   92     7   9
## 71     85  7.4   89     7  10
## 72     NA  8.6   82     7  11
## 73     10 14.3   73     7  12
## 74     27 14.9   81     7  13
## 75     NA 14.9   91     7  14
## 76      7 14.3   80     7  15
## 77     48  6.9   81     7  16
## 78     35 10.3   82     7  17
## 79     61  6.3   84     7  18
## 80     79  5.1   87     7  19
## 81     63 11.5   85     7  20
## 82     16  6.9   74     7  21
## 83     NA  9.7   81     7  22
## 84     NA 11.5   82     7  23
## 85     80  8.6   86     7  24
## 86    108  8.0   85     7  25
## 87     20  8.6   82     7  26
## 88     52 12.0   86     7  27
## 89     82  7.4   88     7  28
## 90     50  7.4   86     7  29
## 91     64  7.4   83     7  30
## 92     59  9.2   81     7  31
## 93     39  6.9   81     8   1
## 94      9 13.8   81     8   2
## 95     16  7.4   82     8   3
## 96     78  6.9   86     8   4
## 97     35  7.4   85     8   5
## 98     66  4.6   87     8   6
## 99    122  4.0   89     8   7
## 100    89 10.3   90     8   8
## 101   110  8.0   90     8   9
## 102    NA  8.6   92     8  10
## 103    NA 11.5   86     8  11
## 104    44 11.5   86     8  12
## 105    28 11.5   82     8  13
## 106    65  9.7   80     8  14
## 107    NA 11.5   79     8  15
## 108    22 10.3   77     8  16
## 109    59  6.3   79     8  17
## 110    23  7.4   76     8  18
## 111    31 10.9   78     8  19
## 112    44 10.3   78     8  20
## 113    21 15.5   77     8  21
## 114     9 14.3   72     8  22
## 115    NA 12.6   75     8  23
## 116    45  9.7   79     8  24
## 117   168  3.4   81     8  25
## 118    73  8.0   86     8  26
## 119    NA  5.7   88     8  27
## 120    76  9.7   97     8  28
## 121   118  2.3   94     8  29
## 122    84  6.3   96     8  30
## 123    85  6.3   94     8  31
## 124    96  6.9   91     9   1
## 125    78  5.1   92     9   2
## 126    73  2.8   93     9   3
## 127    91  4.6   93     9   4
## 128    47  7.4   87     9   5
## 129    32 15.5   84     9   6
## 130    20 10.9   80     9   7
## 131    23 10.3   78     9   8
## 132    21 10.9   75     9   9
## 133    24  9.7   73     9  10
## 134    44 14.9   81     9  11
## 135    21 15.5   76     9  12
## 136    28  6.3   77     9  13
## 137     9 10.9   71     9  14
## 138    13 11.5   71     9  15
## 139    46  6.9   78     9  16
## 140    18 13.8   67     9  17
## 141    13 10.3   76     9  18
## 142    24 10.3   68     9  19
## 143    16  8.0   82     9  20
## 144    13 12.6   64     9  21
## 145    23  9.2   71     9  22
## 146    36 10.3   81     9  23
## 147     7 10.3   69     9  24
## 148    14 16.6   63     9  25
## 149    30  6.9   70     9  26
## 150    NA 13.2   77     9  27
## 151    14 14.3   75     9  28
## 152    18  8.0   76     9  29
## 153    20 11.5   68     9  30
airquality[, c("Temp", "Wind")] # The Temp and Wind columns
##     Temp Wind
## 1     67  7.4
## 2     72  8.0
## 3     74 12.6
## 4     62 11.5
## 5     56 14.3
## 6     66 14.9
## 7     65  8.6
## 8     59 13.8
## 9     61 20.1
## 10    69  8.6
## 11    74  6.9
## 12    69  9.7
## 13    66  9.2
## 14    68 10.9
## 15    58 13.2
## 16    64 11.5
## 17    66 12.0
## 18    57 18.4
## 19    68 11.5
## 20    62  9.7
## 21    59  9.7
## 22    73 16.6
## 23    61  9.7
## 24    61 12.0
## 25    57 16.6
## 26    58 14.9
## 27    57  8.0
## 28    67 12.0
## 29    81 14.9
## 30    79  5.7
## 31    76  7.4
## 32    78  8.6
## 33    74  9.7
## 34    67 16.1
## 35    84  9.2
## 36    85  8.6
## 37    79 14.3
## 38    82  9.7
## 39    87  6.9
## 40    90 13.8
## 41    87 11.5
## 42    93 10.9
## 43    92  9.2
## 44    82  8.0
## 45    80 13.8
## 46    79 11.5
## 47    77 14.9
## 48    72 20.7
## 49    65  9.2
## 50    73 11.5
## 51    76 10.3
## 52    77  6.3
## 53    76  1.7
## 54    76  4.6
## 55    76  6.3
## 56    75  8.0
## 57    78  8.0
## 58    73 10.3
## 59    80 11.5
## 60    77 14.9
## 61    83  8.0
## 62    84  4.1
## 63    85  9.2
## 64    81  9.2
## 65    84 10.9
## 66    83  4.6
## 67    83 10.9
## 68    88  5.1
## 69    92  6.3
## 70    92  5.7
## 71    89  7.4
## 72    82  8.6
## 73    73 14.3
## 74    81 14.9
## 75    91 14.9
## 76    80 14.3
## 77    81  6.9
## 78    82 10.3
## 79    84  6.3
## 80    87  5.1
## 81    85 11.5
## 82    74  6.9
## 83    81  9.7
## 84    82 11.5
## 85    86  8.6
## 86    85  8.0
## 87    82  8.6
## 88    86 12.0
## 89    88  7.4
## 90    86  7.4
## 91    83  7.4
## 92    81  9.2
## 93    81  6.9
## 94    81 13.8
## 95    82  7.4
## 96    86  6.9
## 97    85  7.4
## 98    87  4.6
## 99    89  4.0
## 100   90 10.3
## 101   90  8.0
## 102   92  8.6
## 103   86 11.5
## 104   86 11.5
## 105   82 11.5
## 106   80  9.7
## 107   79 11.5
## 108   77 10.3
## 109   79  6.3
## 110   76  7.4
## 111   78 10.9
## 112   78 10.3
## 113   77 15.5
## 114   72 14.3
## 115   75 12.6
## 116   79  9.7
## 117   81  3.4
## 118   86  8.0
## 119   88  5.7
## 120   97  9.7
## 121   94  2.3
## 122   96  6.3
## 123   94  6.3
## 124   91  6.9
## 125   92  5.1
## 126   93  2.8
## 127   93  4.6
## 128   87  7.4
## 129   84 15.5
## 130   80 10.9
## 131   78 10.3
## 132   75 10.9
## 133   73  9.7
## 134   81 14.9
## 135   76 15.5
## 136   77  6.3
## 137   71 10.9
## 138   71 11.5
## 139   78  6.9
## 140   67 13.8
## 141   76 10.3
## 142   68 10.3
## 143   82  8.0
## 144   64 12.6
## 145   71  9.2
## 146   81 10.3
## 147   69 10.3
## 148   63 16.6
## 149   70  6.9
## 150   77 13.2
## 151   75 14.3
## 152   76  8.0
## 153   68 11.5
age <- c(28, 48, 47, 71, 22, 80, 48, 30, 31)
purchase <- c(20, 59, 2, 12, 22, 160, 34, 34, 29)
bookstore <- data.frame(age, purchase)

bookstore$age[2] <- 18
# or
bookstore[2, 1] <- 18

# subsetting with logical values
nums = c(12,9,8,14,7,16,3,2,9)
nums > 10
## [1]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE
nums[nums > 10]
## [1] 12 14 16
which(nums > 10)
## [1] 1 4 6
# equal to
seq(along = nums)[nums > 10]
## [1] 1 4 6
which.max(airquality$Temp)
## [1] 120
airquality[which.max(airquality$Temp),]
##     Ozone Solar.R Wind Temp Month Day
## 120    76     203  9.7   97     8  28
airquality[airquality$Temp > 90, ]
##     Ozone Solar.R Wind Temp Month Day
## 42     NA     259 10.9   93     6  11
## 43     NA     250  9.2   92     6  12
## 69     97     267  6.3   92     7   8
## 70     97     272  5.7   92     7   9
## 75     NA     291 14.9   91     7  14
## 102    NA     222  8.6   92     8  10
## 120    76     203  9.7   97     8  28
## 121   118     225  2.3   94     8  29
## 122    84     237  6.3   96     8  30
## 123    85     188  6.3   94     8  31
## 124    96     167  6.9   91     9   1
## 125    78     197  5.1   92     9   2
## 126    73     183  2.8   93     9   3
## 127    91     189  4.6   93     9   4
# knowing if all elements in a vector fulfill the condition
all(airquality$Temp > 90)
## [1] FALSE
# knowing whether at least one element in a vector fulfill the condition
any(airquality$Temp > 90)
## [1] TRUE
# finding how many elements that fulfill a condition
sum(airquality$Temp > 90)
## [1] 14
# modifying elements through logical subscriptions
nums[nums > 10] <- 0
nums
## [1] 0 9 8 0 7 0 3 2 9
dd <- data.frame(a = c(5, 9, 12, 15, 17, 11), b = c(8, NA, 12, 10, NA, 15))

dd[dd$b > 10, ]
##       a  b
## NA   NA NA
## 3    12 12
## NA.1 NA NA
## 6    11 15
bookstore$visit_length <- c(5, 2, 20, 22, 12, 31, 9, 10, 11)
bookstore
##   age purchase visit_length
## 1  28       20            5
## 2  18       59            2
## 3  47        2           20
## 4  71       12           22
## 5  22       22           12
## 6  80      160           31
## 7  48       34            9
## 8  30       34           10
## 9  31       29           11
# storing the TRUE or FALSE values in a new variable
airquality$Hot <- airquality$Temp > 90
airquality$Hot
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# filtering and creating a new variable
temp_may <- airquality$Temp[airquality$Month == 5]
temp_june <- airquality$Temp[airquality$Month == 6]

# splitting vectors into lists
temps <- split(airquality$Temp, airquality$Month)
names(temps) <- c("May", "June", "July", "August", "September")
temps
## $May
##  [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
## [26] 58 57 67 81 79 76
## 
## $June
##  [1] 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73 76 77 76 76 76 75
## [26] 78 73 80 77 83
## 
## $July
##  [1] 84 85 81 84 83 83 88 92 92 89 82 73 81 91 80 81 82 84 87 85 74 81 82 86 85
## [26] 82 86 88 86 83 81
## 
## $August
##  [1] 81 81 82 86 85 87 89 90 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81
## [26] 86 88 97 94 96 94
## 
## $September
##  [1] 91 92 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63
## [26] 70 77 75 76 68
temps$June
##  [1] 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73 76 77 76 76 76 75
## [26] 78 73 80 77 83
# collapsing lists into vectors
unlist(temps)
##        May1        May2        May3        May4        May5        May6 
##          67          72          74          62          56          66 
##        May7        May8        May9       May10       May11       May12 
##          65          59          61          69          74          69 
##       May13       May14       May15       May16       May17       May18 
##          66          68          58          64          66          57 
##       May19       May20       May21       May22       May23       May24 
##          68          62          59          73          61          61 
##       May25       May26       May27       May28       May29       May30 
##          57          58          57          67          81          79 
##       May31       June1       June2       June3       June4       June5 
##          76          78          74          67          84          85 
##       June6       June7       June8       June9      June10      June11 
##          79          82          87          90          87          93 
##      June12      June13      June14      June15      June16      June17 
##          92          82          80          79          77          72 
##      June18      June19      June20      June21      June22      June23 
##          65          73          76          77          76          76 
##      June24      June25      June26      June27      June28      June29 
##          76          75          78          73          80          77 
##      June30       July1       July2       July3       July4       July5 
##          83          84          85          81          84          83 
##       July6       July7       July8       July9      July10      July11 
##          83          88          92          92          89          82 
##      July12      July13      July14      July15      July16      July17 
##          73          81          91          80          81          82 
##      July18      July19      July20      July21      July22      July23 
##          84          87          85          74          81          82 
##      July24      July25      July26      July27      July28      July29 
##          86          85          82          86          88          86 
##      July30      July31     August1     August2     August3     August4 
##          83          81          81          81          82          86 
##     August5     August6     August7     August8     August9    August10 
##          85          87          89          90          90          92 
##    August11    August12    August13    August14    August15    August16 
##          86          86          82          80          79          77 
##    August17    August18    August19    August20    August21    August22 
##          79          76          78          78          77          72 
##    August23    August24    August25    August26    August27    August28 
##          75          79          81          86          88          97 
##    August29    August30    August31  September1  September2  September3 
##          94          96          94          91          92          93 
##  September4  September5  September6  September7  September8  September9 
##          93          87          84          80          78          75 
## September10 September11 September12 September13 September14 September15 
##          73          81          76          77          71          71 
## September16 September17 September18 September19 September20 September21 
##          78          67          76          68          82          64 
## September22 September23 September24 September25 September26 September27 
##          71          81          69          63          70          77 
## September28 September29 September30 
##          75          76          68
# using subset
subset(dd, b > 10)
##    a  b
## 3 12 12
## 6 11 15
some <- subset(LifeCycleSavings, sr > 10, select = c(pop15, pop75))
head(some)
##            pop15 pop75
## Australia  29.35  2.87
## Austria    23.32  4.41
## Belgium    23.80  4.43
## Brazil     42.19  0.83
## China      44.75  0.67
## Costa Rica 47.64  1.14
life1 <- subset(LifeCycleSavings, select = pop15:dpi)
# or
life1 <- subset(LifeCycleSavings, select = 1:3)
head(life1)
##              sr pop15 pop75
## Australia 11.43 29.35  2.87
## Austria   12.07 23.32  4.41
## Belgium   13.17 23.80  4.43
## Bolivia    5.75 41.89  1.67
## Brazil    12.88 42.19  0.83
## Canada     8.79 31.72  2.85
life2 <- subset(LifeCycleSavings, select = c(-pop15, -pop75))
# or
life2 <- subset(LifeCycleSavings, select = -c(2, 3))
head(life2)
##              sr     dpi ddpi
## Australia 11.43 2329.68 2.87
## Austria   12.07 1507.99 3.93
## Belgium   13.17 2108.47 3.82
## Bolivia    5.75  189.13 0.22
## Brazil    12.88  728.47 4.56
## Canada     8.79 2982.88 2.43
# sorting with sort()
w <- c(5, 4, 7, 2, 7, 1)
sort(w)
## [1] 1 2 4 5 7 7
sort(w, decreasing = TRUE)
## [1] 7 7 5 4 2 1
length(w) <- 7
sort(w, na.last = TRUE)
## [1]  1  2  4  5  7  7 NA
sort(w, na.last = FALSE)
## [1] NA  1  2  4  5  7  7
# using order() to sort a data frame
v <- c(11, 12, 13, 15, 14)
order(v)
## [1] 1 2 3 5 4
v[order(v)]
## [1] 11 12 13 14 15
u <- c("pig", "cow", "duck", "horse", "rat")
w <- data.frame(v, u)
w
##    v     u
## 1 11   pig
## 2 12   cow
## 3 13  duck
## 4 15 horse
## 5 14   rat
w[order(w$v), ] 
##    v     u
## 1 11   pig
## 2 12   cow
## 3 13  duck
## 5 14   rat
## 4 15 horse
snowdata[order(snowdata$SnowInches), ]
##       Winter SnowInches SnowMeters
## 72 2011-2012        9.3    0.23622
## 33 1972-1973       10.3    0.26162
## 40 1979-1980       12.5    0.31750
## 55 1994-1995       14.9    0.37846
## 62 2001-2002       15.1    0.38354
## 49 1988-1989       15.5    0.39370
## 67 2006-2007       17.1    0.43434
## 46 1985-1986       18.1    0.45974
## 51 1990-1991       19.1    0.48514
## 7  1946-1947       19.4    0.49276
## 52 1991-1992       22.0    0.55880
## 41 1980-1981       22.3    0.56642
## 14 1953-1954       23.6    0.59944
## 2  1941-1942       23.9    0.60706
## 60 1999-2000       24.4    0.61976
## 15 1954-1955       25.1    0.63754
## 58 1997-1998       25.6    0.65024
## 45 1984-1985       26.6    0.67564
## 39 1978-1979       27.5    0.69850
## 35 1974-1975       27.6    0.70104
## 4  1943-1944       27.7    0.70358
## 11 1950-1951       29.7    0.75438
## 13 1952-1953       29.8    0.75692
## 23 1962-1963       30.9    0.78486
## 12 1951-1952       31.9    0.81026
## 10 1949-1950       32.0    0.81280
## 43 1982-1983       32.7    0.83058
## 19 1958-1959       34.1    0.86614
## 70 2009-2010       35.7    0.90678
## 76 2015-2016       36.2    0.91948
## 59 1998-1999       36.4    0.92456
## 34 1973-1974       36.9    0.93726
## 9  1948-1949       37.1    0.94234
## 50 1989-1990       39.2    0.99568
## 64 2003-2004       39.4    1.00076
## 66 2005-2006       39.9    1.01346
## 20 1959-1960       40.9    1.03886
## 47 1986-1987       42.5    1.07950
## 44 1983-1984       43.0    1.09220
## 26 1965-1966       44.1    1.12014
## 18 1957-1958       44.7    1.13538
## 22 1961-1962       44.7    1.13538
## 28 1967-1968       44.8    1.13792
## 3  1942-1943       45.7    1.16078
## 61 2000-2001       45.9    1.16586
## 36 1975-1976       46.6    1.18364
## 32 1971-1972       47.5    1.20650
## 1  1940-1941       47.8    1.21412
## 30 1969-1970       48.8    1.23952
## 25 1964-1965       50.4    1.28016
## 6  1945-1946       50.8    1.29032
## 68 2007-2008       51.2    1.30048
## 57 1996-1997       51.9    1.31826
## 17 1956-1957       52.0    1.32080
## 48 1987-1988       52.6    1.33604
## 29 1968-1969       53.8    1.36652
## 31 1970-1971       57.3    1.45542
## 37 1976-1977       58.5    1.48590
## 74 2013-2014       58.9    1.49606
## 5  1944-1945       59.2    1.50368
## 27 1966-1967       60.1    1.52654
## 16 1955-1956       60.9    1.54686
## 21 1960-1961       61.5    1.56210
## 42 1981-1982       61.8    1.56972
## 24 1963-1964       63.0    1.60020
## 73 2012-2013       63.4    1.61036
## 69 2008-2009       65.9    1.67386
## 63 2002-2003       70.9    1.80086
## 71 2010-2011       81.0    2.05740
## 53 1992-1993       83.9    2.13106
## 38 1977-1978       85.1    2.16154
## 65 2004-2005       86.6    2.19964
## 8  1947-1948       89.2    2.26568
## 54 1993-1994       96.3    2.44602
## 56 1995-1996      107.6    2.73304
## 75 2014-2015      110.6    2.80924
snowdata[order(snowdata$SnowInches, snowdata$SnowMeters), ]
##       Winter SnowInches SnowMeters
## 72 2011-2012        9.3    0.23622
## 33 1972-1973       10.3    0.26162
## 40 1979-1980       12.5    0.31750
## 55 1994-1995       14.9    0.37846
## 62 2001-2002       15.1    0.38354
## 49 1988-1989       15.5    0.39370
## 67 2006-2007       17.1    0.43434
## 46 1985-1986       18.1    0.45974
## 51 1990-1991       19.1    0.48514
## 7  1946-1947       19.4    0.49276
## 52 1991-1992       22.0    0.55880
## 41 1980-1981       22.3    0.56642
## 14 1953-1954       23.6    0.59944
## 2  1941-1942       23.9    0.60706
## 60 1999-2000       24.4    0.61976
## 15 1954-1955       25.1    0.63754
## 58 1997-1998       25.6    0.65024
## 45 1984-1985       26.6    0.67564
## 39 1978-1979       27.5    0.69850
## 35 1974-1975       27.6    0.70104
## 4  1943-1944       27.7    0.70358
## 11 1950-1951       29.7    0.75438
## 13 1952-1953       29.8    0.75692
## 23 1962-1963       30.9    0.78486
## 12 1951-1952       31.9    0.81026
## 10 1949-1950       32.0    0.81280
## 43 1982-1983       32.7    0.83058
## 19 1958-1959       34.1    0.86614
## 70 2009-2010       35.7    0.90678
## 76 2015-2016       36.2    0.91948
## 59 1998-1999       36.4    0.92456
## 34 1973-1974       36.9    0.93726
## 9  1948-1949       37.1    0.94234
## 50 1989-1990       39.2    0.99568
## 64 2003-2004       39.4    1.00076
## 66 2005-2006       39.9    1.01346
## 20 1959-1960       40.9    1.03886
## 47 1986-1987       42.5    1.07950
## 44 1983-1984       43.0    1.09220
## 26 1965-1966       44.1    1.12014
## 18 1957-1958       44.7    1.13538
## 22 1961-1962       44.7    1.13538
## 28 1967-1968       44.8    1.13792
## 3  1942-1943       45.7    1.16078
## 61 2000-2001       45.9    1.16586
## 36 1975-1976       46.6    1.18364
## 32 1971-1972       47.5    1.20650
## 1  1940-1941       47.8    1.21412
## 30 1969-1970       48.8    1.23952
## 25 1964-1965       50.4    1.28016
## 6  1945-1946       50.8    1.29032
## 68 2007-2008       51.2    1.30048
## 57 1996-1997       51.9    1.31826
## 17 1956-1957       52.0    1.32080
## 48 1987-1988       52.6    1.33604
## 29 1968-1969       53.8    1.36652
## 31 1970-1971       57.3    1.45542
## 37 1976-1977       58.5    1.48590
## 74 2013-2014       58.9    1.49606
## 5  1944-1945       59.2    1.50368
## 27 1966-1967       60.1    1.52654
## 16 1955-1956       60.9    1.54686
## 21 1960-1961       61.5    1.56210
## 42 1981-1982       61.8    1.56972
## 24 1963-1964       63.0    1.60020
## 73 2012-2013       63.4    1.61036
## 69 2008-2009       65.9    1.67386
## 63 2002-2003       70.9    1.80086
## 71 2010-2011       81.0    2.05740
## 53 1992-1993       83.9    2.13106
## 38 1977-1978       85.1    2.16154
## 65 2004-2005       86.6    2.19964
## 8  1947-1948       89.2    2.26568
## 54 1993-1994       96.3    2.44602
## 56 1995-1996      107.6    2.73304
## 75 2014-2015      110.6    2.80924
# getting the location of the maximum and minimum values
which.max(snowdata$Boston)
## integer(0)
which.min(snowdata$Boston)
## integer(0)
# using indexes on the rows and/or columns
rows <- Arthritis$Sex == "Female" & Arthritis$Age > 68
cols <- c("Treatment", "Improved")
Arthritis[rows, cols]
##    Treatment Improved
## 39   Treated     None
## 40   Treated     Some
## 41   Treated     Some
## 84   Placebo   Marked
# using names of values of a column to subset and create a new variable (no data)
# btw9s[c("BB", "BE", "MV","SN","ST","TH"), "EW"] <- "East"

# creating new variables based on others and reordering (no data)
# Fleiss93 <- within(Fleiss93, {
#   total <- n.e + n.c # create new var based on the sum of two
#   st1 <- as.character(study) # change type
#   st <- reorder(study, -(total)) # reorder var
# })

# fixing the naming (no data)
# exp1_long$condition <- ifelse(exp1_long$condition == "no", "No_communication",
#   ifelse(exp1_long$condition == "go", "High_confidence",
#     ifelse(exp1_long$condition == "me", "Medium_confidence",
#       ifelse(exp1_long$condition == "ba", "Low_confidence",
#         exp1_long$condition
#       )
#     )
#   )
# )

# renaming variables and convert to numeric (no data)
# exp1_long$temperature <- as.numeric(ifelse(exp1_long$temperature == "315", "31.5",
#   ifelse(exp1_long$temperature == "325", "32.5",
#     ifelse(exp1_long$temperature == "335", "33.5",
#       ifelse(exp1_long$temperature == "345", "34.5",
#         ifelse(exp1_long$temperature == "355", "35.5",
#           ifelse(exp1_long$temperature == "365", "36.5",
#             exp1_long$temperature
#           )
#         )
#       )
#     )
#   )
# ))

# recoding Yes/No responses as numeric (Yes=1, No=0) (no data)
# exp1_long$response_code <- ifelse(exp1_long$response == "Yes", 1, 0)

# adding values by row and getting rid of all NAs (no data)
# df.a$expertise_sum <- rowSums(df.a[, 12:19], na.rm = TRUE)

# cutting numeric variables into categories
age <- c(60, 58, 24, 26, 34, 42, 31, 30, 33, 2, 9)
age.breaks <- seq(from = 0, to = 60, by = 20)
age.labels <- c("young", "adult", "older")
age.group <- cut(x = age, breaks = age.breaks, labels = age.labels)
age.df <- data.frame(age, age.group)
age.df
##    age age.group
## 1   60     older
## 2   58     older
## 3   24     adult
## 4   26     adult
## 5   34     adult
## 6   42     older
## 7   31     adult
## 8   30     adult
## 9   33     adult
## 10   2     young
## 11   9     young
# creating a temporal seq of years and calculate mean (no data)
# yrs <- c(seq(1972, 1988, 4), 1993, seq(1996, 2016, 4)) # seq every 4 years
# Calculating mean for every year 
# mean_age <- gss_lon %>%
#   filter(age %nin% NA && year %in% yrs) %>%
#   group_by(year) %>%
#   summarize(xbar = round(mean(age, na.rm = TRUE), 0))

# splitting a vector by group (no data)
# using split function: x = the variable that needs to be split into groups; y = the grouping variable
# speech.by.char <- split(x = utterance, y = speaker)
# speech.by.char

# pull variables from a list or data frame out and into the workspace
# importList(speech.by.char)

# locating NAs in a data frame
# pos_country <- which(is.na(df$Region))

# replacing NAs in a data frame for the last value with na.locf() (no data)
# replacing NAs for the last value
# df <- mutate(df, Region = zoo::na.locf(Region))

6.2 Tidyverse

filter() picks rows based on data values, but select() chooses columns based on the column names (not values of data within the columns).

Select certain rows based on a logical condition - dplyr´s filter(). To check for one condition OR another condition, use the | symbol, which means or. You can get the number of rows in a data frame with nrow(). To filter by one condition AND a second condition, you can use the & sign. Select certain rows based on row number - dplyr”s slice(). If we want a way to automatically check a condition across multiple columns, we need functions that are basically a derivative of the across function: if_any and if_all. What both functions have in common is that they produce TRUE or FALSE values. When working with filter, these logical values are usually generated by checking a condition for a single column. The trick with if_any and if_all is that they check a condition across multiple columns and return a TRUE or FALSE value for each row.

  • if_any indicates whether one of the selected columns fulfills a condition
  • if_all indicates whether all selected columns satisfy a condition

The following structure applies to both if_any and if_all:

%>% filter( if_any( .cols = , .fns = , ) )

Both functions were introduced in dplyr in February 2021. One reason for their introduction is that across was not feasible with filter. Similar to across, we can use tidyselect functions to select columns for which we want to check the condition.

Another very useful use case is filtering rows based on missing values across multiple columns. Besides that, we can combine case_when with if_any / if_all to create a new column based on multiple column-spanning conditions. This works because the left side of the two-sided case_when formulas expects a logical value ( == ).

Both slicing and filtering allow you to remove or keep rows in a data frame. Essentially, you can use both to achieve the same result, but their approaches differ. While filter works with conditions (e.g. displ > 17), slice works with indices. slice keeps all rows for which you specify positive indices. Note that in R indexing starts with 1 and not with 0 as in most other programming languages. You can also provide a vector of indices instead of the comma-separated indices in the slice function. To remove specific rows, we can use negative indices. A common use case within the slice family is to slice rows that have the highest or lowest value within a column. Finding these rows with filter would be tedious. A much easier way to achieve the same result is to use slice_max(). For the first argument order_by you specify the column for which the highest values should be taken. With n you specify how many of the rows with the highest values you want to keep. If you are more interested in the percentage of rows with the highest value, you can use the argument prop. Similarly, you can keep the rows with the lowest values in a given column with slice_min(). The slice functions become especially powerful when combined with group_by. The trick is that any function called after group_by is only applied to the subgroups. Another useful function is slice_sample. It randomly selects rows from your data frame. You define how many should be selected. slice_sample by default samples without replacement. Once we have selected a row, we cannot select it again. Consequently, there will be no duplicate rows in our data frame. However, if we set the replace argument to TRUE, we will perform sampling with replacement. We can find the duplicate rows with the function get_dupes from the janitor package.

This functionality allows us to create bootstraps from our data frame. Bootstrapping is a technique where a set of samples of the same size are drawn from a single original sample.For example, if you have a vector c(1, 4, 5, 6), you can create the following bootstraps from this vector: c(1, 4, 4, 6), c(1, 1, 1, 1) or c(5, 5, 1, 6). Some values appear more than once because bootstrapping allows each value to be pulled multiple times from the original data set. Once you have your bootstraps, you can calculate metrics from them. For example, the mean value of each bootstrap. The underlying logic of this technique is that since the sample itself is from a population, the bootstraps act as proxies for other samples from that population. We can create many bootstrap from our sample. frames. Once we have the bootstraps, we can calculate any metric from them. Usually one calculates confidence intervals, standard deviations, but also measures of center like the mean from the bootstraps.

Use arrange(dataframe, colname) to sort in ascending order and arrange(dataframe, desc(colname)) to sort in descending order. To sort by a second column in case there are ties in the first column, the syntax is arrange(dataframe, col1, col2).

You can select by specific column name, no quotes or c() needed: select(snowdata, Winter, Boston). Select a contiguous group of columns, such as starting with Boston and ending with New York City, with the syntax select(snowdata, Boston:NYC). You can select based on column names containing certain characters; for example, if you had a data frame with column names in the format city_state such as Boston_MA, Chicago_IL, NYC_NY, Fargo_ND and Syracuse_NY, you could select all the New York State entries using select(dataframe, contains("_NY")) or select(dataframe, ends_with("_NY"). You can delete columns by putting a minus sign before your selection, such as select(snowdata, -(Boston:Chicago)) or select(dataframe, -contains("_NY")). select_if() lets you use is. functions such as is.numeric() or is.character() to choose columns by data type.

The tidyselect functions give you many options to select columns in R. The first function is everything. As the name suggests, it lets you select all columns of a data frame. Then we have last_col. With this function you can select the last column in a data frame. We have the two functions starts_with and ends_with. You use these functions when you want to select columns that start or end with exactly a certain string. starts_with and ends_with works with any character, but also with a vector of characters.

Next we have the contains function. contains searches for columns that contain a specific string. Note that it does not work with regular expressions, but searches for exactly the string you specify. By default, however, the function is not case-sensitive. It doesn´t matter if your columns are in uppercase or lowercase. If you are concerned about case sensitivity, set the ignore.case argument to FALSE (this also works with starts_with, ends_with, and matches). Unlike contains, matches works with regular expressions.

The function num_range is useful if your column names follow a certain pattern.

Finally, there is the where function. where is used when you want to select variables of a certain data type. Other predicate functions are:

  • is.double
  • is.logical
  • is.factor
  • is.integer

You can combine the different selection functions with the & and | operators.

With mutate you normally specify the new column to be created or overwritten. With across you don’t have to do that. Without specifying the .names argument, the names of your columns remain the same, you just apply the function(s) to those columns.

When we impute missing values, we replace them with substituted values.

snowdata <- rio::import("input/BostonChicagoNYCSnowfalls.csv")
# knowing how may rows
snowdata[nrow(snowdata), ]
##       Winter Boston Chicago  NYC
## 76 2015-2016   36.2    31.2 32.1
# using filter from dplyr
filter(snowdata, Boston > 100)
##      Winter Boston Chicago  NYC
## 1 1995-1996  107.6    23.9 75.6
## 2 2014-2015  110.6    50.7 50.3
# OR
filter(snowdata, Boston < 20 | Boston > 100)
##       Winter Boston Chicago  NYC
## 1  1946-1947   19.4    34.1 30.6
## 2  1972-1973   10.3    32.9  2.8
## 3  1979-1980   12.5    42.4 12.8
## 4  1985-1986   18.1    29.0 13.0
## 5  1988-1989   15.5    24.5  8.1
## 6  1990-1991   19.1    36.7 24.9
## 7  1994-1995   14.9    24.1 11.8
## 8  1995-1996  107.6    23.9 75.6
## 9  2001-2002   15.1    31.1  3.5
## 10 2006-2007   17.1    35.6 12.4
## 11 2011-2012    9.3    19.8  7.4
## 12 2014-2015  110.6    50.7 50.3
# AND
filter(snowdata, Boston > 40 & Boston < 50)
##       Winter Boston Chicago  NYC
## 1  1940-1941   47.8    52.5 39.0
## 2  1942-1943   45.7    45.2 29.5
## 3  1957-1958   44.7    20.0 44.7
## 4  1959-1960   40.9    50.9 39.2
## 5  1961-1962   44.7    58.9 18.1
## 6  1965-1966   44.1    24.9 21.4
## 7  1967-1968   44.8    28.4 19.5
## 8  1969-1970   48.8    77.0 25.6
## 9  1971-1972   47.5    46.8 22.9
## 10 1975-1976   46.6    43.3 17.3
## 11 1983-1984   43.0    49.0 25.4
## 12 1986-1987   42.5    26.2 23.1
## 13 2000-2001   45.9    39.2 35.0
filter(snowdata, Boston > 50, Boston < 80)
##       Winter Boston Chicago  NYC
## 1  1944-1945   59.2    34.9 27.1
## 2  1945-1946   50.8    23.9 31.4
## 3  1955-1956   60.9    26.3 33.5
## 4  1956-1957   52.0    31.3 21.9
## 5  1960-1961   61.5    40.7 54.7
## 6  1963-1964   63.0    35.2 44.7
## 7  1964-1965   50.4    59.5 24.4
## 8  1966-1967   60.1    68.4 51.5
## 9  1968-1969   53.8    29.4 30.2
## 10 1970-1971   57.3    37.9 15.5
## 11 1976-1977   58.5    54.1 24.5
## 12 1981-1982   61.8    59.3 24.6
## 13 1987-1988   52.6    42.6 19.1
## 14 1996-1997   51.9    40.6 10.0
## 15 2002-2003   70.9    28.6 49.3
## 16 2007-2008   51.2    60.3 11.9
## 17 2008-2009   65.9    52.7 27.6
## 18 2012-2013   63.4    30.1 26.1
## 19 2013-2014   58.9    82.0 57.4
# how to filter rows based on a condition across mulitple columns
# getting the data
df <- tibble(
  a = c(1, 2, 3),
  b = c(NA, 4, 8),
  c = c(1, 4, 1)
)
# filtering all rows that contain missing values (wrong version!)
df %>%
  filter(!is.na(a) & !is.na(b) & !is.na(c))
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     2     4     4
## 2     3     8     1
# with if_any()
billboard %>%
  filter(
    if_any(
      .cols = contains("wk"),
      .fns = ~ . == 1
    )
  )
## # A tibble: 17 × 79
##    artist track date.ent…¹   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8   wk9
##    <chr>  <chr> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Aaliy… Try … 2000-03-18    59    53    38    28    21    18    16    14    12
##  2 Aguil… Come… 2000-08-05    57    47    45    29    23    18    11     9     9
##  3 Aguil… What… 1999-11-27    71    51    28    18    13    13    11     1     1
##  4 Carey… Than… 1999-12-11    82    68    50    50    41    37    26    22    22
##  5 Creed  With… 2000-05-13    84    78    76    74    70    68    74    75    69
##  6 Desti… Inde… 2000-09-23    78    63    49    33    23    15     7     5     1
##  7 Desti… Say … 1999-12-25    83    83    44    38    16    13    16    16    16
##  8 Igles… Be W… 2000-04-01    63    45    34    23    17    12     9     8     8
##  9 Janet  Does… 2000-06-17    59    52    43    30    29    22    15    10    10
## 10 Lones… Amaz… 1999-06-05    81    54    44    39    38    33    29    29    32
## 11 Madon… Music 2000-08-12    41    23    18    14     2     1     1     1     1
## 12 N'Sync It's… 2000-05-06    82    70    51    39    26    19    15     9     7
## 13 Santa… Mari… 2000-02-12    15     8     6     5     2     3     2     2     1
## 14 Savag… I Kn… 1999-10-23    71    48    43    31    20    13     7     6     4
## 15 Sisqo  Inco… 2000-06-24    77    66    61    61    61    55     2     1     1
## 16 Verti… Ever… 2000-01-22    70    61    53    46    40    33    31    26    22
## 17 match… Bent  2000-04-29    60    37    29    24    22    21    18    16    13
## # … with 67 more variables: wk10 <dbl>, wk11 <dbl>, wk12 <dbl>, wk13 <dbl>,
## #   wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>, wk19 <dbl>,
## #   wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>, wk25 <dbl>,
## #   wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>, wk31 <dbl>,
## #   wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>, wk37 <dbl>,
## #   wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, wk43 <dbl>,
## #   wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, …
# with if_all()
billboard %>%
  filter(
    if_all(
      .cols = matches("wk[1-5]$"),
      .fns = ~ . <= 50
    )
  )
## # A tibble: 13 × 79
##    artist track date.ent…¹   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8   wk9
##    <chr>  <chr> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 "Agui… I Tu… 2000-04-15    50    39    30    28    21    19    20    17    17
##  2 "Back… Shap… 2000-10-14    39    25    24    15    12    12    10     9    10
##  3 "Dixi… Good… 2000-03-18    40    29    24    24    20    20    20    19    38
##  4 "Elli… Hot … 1999-11-27    36    21    13     9     7     7     5     7     7
##  5 "Guy"  Danc… 1999-12-18    46    29    19    22    36    44    58    58    68
##  6 "Lil … Boun… 2000-08-19    48    35    24    24    20    20    20    20    22
##  7 "Mado… Amer… 2000-02-19    43    35    29    29    33    32    40    58    88
##  8 "Mado… Music 2000-08-12    41    23    18    14     2     1     1     1     1
##  9 "Mart… She … 2000-10-07    38    28    21    21    18    16    13    13    12
## 10 "N'Sy… Bye … 2000-01-29    42    20    19    14    13     7     6     5     5
## 11 "No D… Simp… 2000-07-01    50    40    39    38    38    48    52    55    80
## 12 "Pink" Ther… 2000-03-04    25    15    12    11    11     7     7    12    14
## 13 "Sant… Mari… 2000-02-12    15     8     6     5     2     3     2     2     1
## # … with 67 more variables: wk10 <dbl>, wk11 <dbl>, wk12 <dbl>, wk13 <dbl>,
## #   wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>, wk19 <dbl>,
## #   wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>, wk25 <dbl>,
## #   wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>, wk31 <dbl>,
## #   wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>, wk37 <dbl>,
## #   wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, wk43 <dbl>,
## #   wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, …
# how to filter rows that contain missing values
# This data frame comes from the tidyr documentation:
# https://tidyr.tidyverse.org/reference/complete.html
(df <- tibble(
  item_name = c("a", "a", "b", "b"),
  group = c(1, NA, 1, 2),
  value1 = c(1, NA, 3, 4),
  value2 = c(4, 5, NA, 7)
))
## # A tibble: 4 × 4
##   item_name group value1 value2
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 a             1      1      4
## 2 a            NA     NA      5
## 3 b             1      3     NA
## 4 b             2      4      7
# keeping all rows whose numeric columns do not contain missing value
df %>%
  filter(
    if_all(
      .cols = where(is.numeric),
      .fns = ~ !is.na(.)
    )
  )
## # A tibble: 2 × 4
##   item_name group value1 value2
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 a             1      1      4
## 2 b             2      4      7
# how to create new columns based conditions across multiple columns
billboard %>%
  mutate(
    top_song = case_when(
      if_any(
        .cols = contains("wk"),
        .fns = ~ . == 1
      ) ~ "top song",
      TRUE ~ "no top song"
    )
  ) %>%
  select(artist, track, top_song)
## # A tibble: 317 × 3
##    artist         track                   top_song   
##    <chr>          <chr>                   <chr>      
##  1 2 Pac          Baby Don't Cry (Keep... no top song
##  2 2Ge+her        The Hardest Part Of ... no top song
##  3 3 Doors Down   Kryptonite              no top song
##  4 3 Doors Down   Loser                   no top song
##  5 504 Boyz       Wobble Wobble           no top song
##  6 98^0           Give Me Just One Nig... no top song
##  7 A*Teens        Dancing Queen           no top song
##  8 Aaliyah        I Don't Wanna           no top song
##  9 Aaliyah        Try Again               top song   
## 10 Adams, Yolanda Open My Heart           no top song
## # … with 307 more rows
# using slice()
myresults <- slice(snowdata, 60:76)

# using a vector of indices with slice()
economics %>%
  rownames_to_column(var = "row_number") %>%
  slice(c(4, 8, 10))
## # A tibble: 3 × 7
##   row_number date         pce    pop psavert uempmed unemploy
##   <chr>      <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 4          1967-10-01  512. 199311    12.9     4.9     3143
## 2 8          1968-02-01  534. 199920    12.3     4.5     3001
## 3 10         1968-04-01  544  200208    12.3     4.6     2709
# removing the fist row in a data set
economics %>%
  slice(-1)
## # A tibble: 573 × 6
##    date         pce    pop psavert uempmed unemploy
##    <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
##  1 1967-08-01  510. 198911    12.6     4.7     2945
##  2 1967-09-01  516. 199113    11.9     4.6     2958
##  3 1967-10-01  512. 199311    12.9     4.9     3143
##  4 1967-11-01  517. 199498    12.8     4.7     3066
##  5 1967-12-01  525. 199657    11.8     4.8     3018
##  6 1968-01-01  531. 199808    11.7     5.1     2878
##  7 1968-02-01  534. 199920    12.3     4.5     3001
##  8 1968-03-01  544. 200056    11.7     4.1     2877
##  9 1968-04-01  544  200208    12.3     4.6     2709
## 10 1968-05-01  550. 200361    12       4.4     2740
## # … with 563 more rows
# removing the last row
economics %>%
  slice(-nrow(.))
## # A tibble: 573 × 6
##    date         pce    pop psavert uempmed unemploy
##    <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
##  1 1967-07-01  507. 198712    12.6     4.5     2944
##  2 1967-08-01  510. 198911    12.6     4.7     2945
##  3 1967-09-01  516. 199113    11.9     4.6     2958
##  4 1967-10-01  512. 199311    12.9     4.9     3143
##  5 1967-11-01  517. 199498    12.8     4.7     3066
##  6 1967-12-01  525. 199657    11.8     4.8     3018
##  7 1968-01-01  531. 199808    11.7     5.1     2878
##  8 1968-02-01  534. 199920    12.3     4.5     3001
##  9 1968-03-01  544. 200056    11.7     4.1     2877
## 10 1968-04-01  544  200208    12.3     4.6     2709
## # … with 563 more rows
# how to slice off the top and bottom of a data frame
# getting the data
survey_results <- tribble(
  ~id, ~name, ~pre, ~post,
  1, "Test", 4, 4,
  2, "Test", 6, 8,
  3, "Millner", 2, 9,
  4, "Josh", 4, 7,
  5, "Bob", 3, 4
)

# slicing the top n rows of a data frame
survey_results %>%
  slice_tail(
    n = nrow(.) - 2
  )
## # A tibble: 3 × 4
##      id name      pre  post
##   <dbl> <chr>   <dbl> <dbl>
## 1     3 Millner     2     9
## 2     4 Josh        4     7
## 3     5 Bob         3     4
# or
survey_results %>%
  slice_tail(n = 3)
## # A tibble: 3 × 4
##      id name      pre  post
##   <dbl> <chr>   <dbl> <dbl>
## 1     3 Millner     2     9
## 2     4 Josh        4     7
## 3     5 Bob         3     4
# or
survey_results %>%
  filter(name != "Test")
## # A tibble: 3 × 4
##      id name      pre  post
##   <dbl> <chr>   <dbl> <dbl>
## 1     3 Millner     2     9
## 2     4 Josh        4     7
## 3     5 Bob         3     4
# slicing the bottom n rows
survey_results %>%
  slice_head(
    n = 2
  )
## # A tibble: 2 × 4
##      id name    pre  post
##   <dbl> <chr> <dbl> <dbl>
## 1     1 Test      4     4
## 2     2 Test      6     8
# how to slice rows with the highest and lowest values in a given column
# with filter
economics %>%
  filter(unemploy >= sort(.$unemploy, decreasing = TRUE)[10]) %>%
  arrange(desc(unemploy)) %>%
  select(date, unemploy)
## # A tibble: 10 × 2
##    date       unemploy
##    <date>        <dbl>
##  1 2009-10-01    15352
##  2 2010-04-01    15325
##  3 2009-11-01    15219
##  4 2010-03-01    15202
##  5 2010-02-01    15113
##  6 2009-12-01    15098
##  7 2010-11-01    15081
##  8 2010-01-01    15046
##  9 2009-09-01    15009
## 10 2010-05-01    14849
# with slice_max()
economics %>%
  slice_max(
    order_by = unemploy,
    n = 10
  ) %>%
  select(date, unemploy)
## # A tibble: 10 × 2
##    date       unemploy
##    <date>        <dbl>
##  1 2009-10-01    15352
##  2 2010-04-01    15325
##  3 2009-11-01    15219
##  4 2010-03-01    15202
##  5 2010-02-01    15113
##  6 2009-12-01    15098
##  7 2010-11-01    15081
##  8 2010-01-01    15046
##  9 2009-09-01    15009
## 10 2010-05-01    14849
# the percentage of rows with the highest value
economics %>%
  slice_max(
    order_by = unemploy,
    prop = .1
  )
## # A tibble: 57 × 6
##    date          pce     pop psavert uempmed unemploy
##    <date>      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
##  1 2009-10-01  9932. 308189      5.4    18.9    15352
##  2 2010-04-01 10113. 309191.     6.4    22.1    15325
##  3 2009-11-01  9940. 308418      5.9    19.8    15219
##  4 2010-03-01 10089. 309212      5.7    20.4    15202
##  5 2010-02-01 10031. 309027      5.8    19.9    15113
##  6 2009-12-01  9999. 308633      5.9    20.1    15098
##  7 2010-11-01 10355. 310596.     6.6    21      15081
##  8 2010-01-01 10002. 308833      6.1    20      15046
##  9 2009-09-01  9883. 307946      5.9    17.8    15009
## 10 2010-05-01 10131  309369.     7      22.3    14849
## # … with 47 more rows
# rows with the lowest values in a given column
economics %>%
  slice_min(
    order_by = unemploy,
    n = 3
  )
## # A tibble: 3 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1968-12-01  576. 201621    11.1     4.4     2685
## 2 1968-09-01  568. 201095    10.6     4.6     2686
## 3 1968-10-01  572. 201290    10.8     4.8     2689
# how to combine the slice functions with group_by
(highest_unemploy_per_month <- economics %>%
  group_by(year = year(date)) %>%
  slice_max(
    order_by = unemploy,
    n = 1
  ) %>%
  ungroup()) # At the end we terminate the grouping function with ungroup. Otherwise we would not apply the next functions to the whole data frame, but to the individual groups.
## # A tibble: 49 × 7
##    date         pce    pop psavert uempmed unemploy  year
##    <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl> <int>
##  1 1967-10-01  512. 199311    12.9     4.9     3143  1967
##  2 1968-02-01  534. 199920    12.3     4.5     3001  1968
##  3 1969-10-01  618. 203302    11.4     4.5     3049  1969
##  4 1970-12-01  666. 206238    13.2     5.9     5076  1970
##  5 1971-11-01  721. 208555    13.1     6.4     5161  1971
##  6 1972-03-01  749. 209212    11.8     6.6     5038  1972
##  7 1973-12-01  877. 212785    14.8     4.7     4489  1973
##  8 1974-12-01  962. 214782    14       5.7     6636  1974
##  9 1975-05-01 1019. 215523    17.3     9.4     8433  1975
## 10 1976-11-01 1189  218834    11.4     8.4     7620  1976
## # … with 39 more rows
# visualizing the results
highest_unemploy_per_month %>%
  mutate(
    month = month(date) %>% as.factor()
  ) %>%
  count(month) %>%
  ggplot(aes(x = month, y = n)) +
  geom_col()

# how to create bootstraps with slice_sample
economics %>%
  slice_sample(n = 20)
## # A tibble: 20 × 6
##    date          pce     pop psavert uempmed unemploy
##    <date>      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
##  1 2002-05-01  7283. 287623      6.5     9.5     8399
##  2 2003-12-01  7929. 292008      5.4    10.4     8317
##  3 1973-12-01   877. 212785     14.8     4.7     4489
##  4 2005-06-01  8725. 295936      2.9     9       7524
##  5 2010-10-01 10305. 310400.     6.6    21.2    14516
##  6 1974-02-01   890. 213074     14.2     5.1     4731
##  7 1976-12-01  1212. 219006     10.6     8       7545
##  8 1995-05-01  4933. 265998      7.1     9.1     7430
##  9 2010-08-01 10228. 309958.     6.9    21      14648
## 10 1976-06-01  1140. 217861     11.4     7.8     7322
## 11 2011-11-01 10760. 312830.     7      20.8    13302
## 12 1969-04-01   594. 202161      9.7     4       2758
## 13 1978-08-01  1447  222805     10.5     5.8     6080
## 14 1989-07-01  3586. 247342      8.2     5.6     6495
## 15 1987-09-01  3126. 243223      7.6     6       7102
## 16 1980-08-01  1764. 227953     11.3     7.5     8281
## 17 1971-12-01   728. 208740     13       6.2     5154
## 18 1969-10-01   618. 203302     11.4     4.5     3049
## 19 1994-03-01  4646  262352      6.8     9.3     8470
## 20 1968-08-01   567  200898     10.5     4.2     2768
# setting replacement
set.seed(455)
(sample_with_replacement <- economics %>%
  slice_sample(prop = 1, replace = TRUE))
## # A tibble: 574 × 6
##    date          pce     pop psavert uempmed unemploy
##    <date>      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
##  1 1968-01-01   531. 199808     11.7     5.1     2878
##  2 1968-07-01   563. 200706     10.7     4.5     2883
##  3 2010-03-01 10089. 309212      5.7    20.4    15202
##  4 1969-06-01   601. 202507     11.1     4.4     2816
##  5 1968-09-01   568. 201095     10.6     4.6     2686
##  6 1978-09-01  1453. 223053     10.6     5.6     6125
##  7 1975-10-01  1061. 216587     13.4     8.6     7897
##  8 1980-11-01  1827. 228612     11.6     7.7     8023
##  9 1967-08-01   510. 198911     12.6     4.7     2945
## 10 2012-09-01 11062. 314647.     8.2    18.8    12115
## # … with 564 more rows
# getting duplicates
sample_with_replacement %>%
  janitor::get_dupes()
## # A tibble: 363 × 7
##    date         pce    pop psavert uempmed unemploy dupe_count
##    <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>      <int>
##  1 1967-08-01  510. 198911    12.6     4.7     2945          2
##  2 1967-08-01  510. 198911    12.6     4.7     2945          2
##  3 1968-02-01  534. 199920    12.3     4.5     3001          3
##  4 1968-02-01  534. 199920    12.3     4.5     3001          3
##  5 1968-02-01  534. 199920    12.3     4.5     3001          3
##  6 1968-07-01  563. 200706    10.7     4.5     2883          3
##  7 1968-07-01  563. 200706    10.7     4.5     2883          3
##  8 1968-07-01  563. 200706    10.7     4.5     2883          3
##  9 1968-09-01  568. 201095    10.6     4.6     2686          2
## 10 1968-09-01  568. 201095    10.6     4.6     2686          2
## # … with 353 more rows
# creating bootstrap
bootstraps <- map(1:2000, ~ slice_sample(economics, prop = 1, replace = TRUE))
bootstraps %>% head(n = 2)
## [[1]]
## # A tibble: 574 × 6
##    date         pce    pop psavert uempmed unemploy
##    <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
##  1 1992-04-01 4132. 255992     9.9     8.5     9415
##  2 1993-09-01 4512. 260867     6.9     8.3     8714
##  3 1999-05-01 6226. 278717     4.9     6.5     5796
##  4 1989-07-01 3586. 247342     8.2     5.6     6495
##  5 1986-05-01 2858. 240271     9.3     6.8     8439
##  6 2006-11-01 9380. 300094     3.9     8.3     6872
##  7 2000-05-01 6708. 281877     4.9     5.8     5758
##  8 1996-12-01 5379. 271125     6.4     7.8     7253
##  9 1992-10-01 4285. 257861     8       9       9398
## 10 1970-11-01  657. 206024    13.6     5.6     4898
## # … with 564 more rows
## 
## [[2]]
## # A tibble: 574 × 6
##    date          pce     pop psavert uempmed unemploy
##    <date>      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
##  1 1979-11-01  1657. 226027      9.7     5.3     6238
##  2 1992-07-01  4205. 256894      9.6     8.6     9850
##  3 1975-03-01   991. 215198     12.7     7.2     7978
##  4 1970-07-01   648. 205052     13.5     5.1     4175
##  5 2013-11-01 11488. 317228.     6.2    17.1    10787
##  6 1999-12-01  6539. 280716      4.4     5.8     5653
##  7 1978-10-01  1467. 223271     10.7     5.9     5947
##  8 1980-04-01  1695. 227061     11.3     5.8     7358
##  9 1995-12-01  5098. 267943      6.1     8.3     7423
## 10 1993-11-01  4554. 261425      6.3     8.3     8542
## # … with 564 more rows
# getting the mean from the bootstraps
means <- map_dbl(bootstraps, ~ mean(.$unemploy))
p1 <- ggplot(NULL, aes(x = means)) +
  geom_histogram(fill = "grey80", color = "black") +
  labs(x = "means slice_sample")

# getting the mean of our data set (pretty close to the mean value of our bootstraps!)
economics$unemploy %>% mean
## [1] 7771.31
# comparing our bootstraps with the one created with tidymodels
boostraps_tidymodels <- rsample::bootstraps(economics, times = 2000)
means_tidymodels <- map_dbl(
  boostraps_tidymodels$splits,
  ~ mean(rsample::analysis(.)$unemploy)
)

p2 <- ggplot(NULL, aes(x = means_tidymodels)) +
  geom_histogram(fill = "grey80", color = "black")

(p1 | p2)

# arranging data frames
# ascending order
arrange(snowdata, Boston)
##       Winter Boston Chicago  NYC
## 1  2011-2012    9.3    19.8  7.4
## 2  1972-1973   10.3    32.9  2.8
## 3  1979-1980   12.5    42.4 12.8
## 4  1994-1995   14.9    24.1 11.8
## 5  2001-2002   15.1    31.1  3.5
## 6  1988-1989   15.5    24.5  8.1
## 7  2006-2007   17.1    35.6 12.4
## 8  1985-1986   18.1    29.0 13.0
## 9  1990-1991   19.1    36.7 24.9
## 10 1946-1947   19.4    34.1 30.6
## 11 1991-1992   22.0    28.4 12.6
## 12 1980-1981   22.3    35.0 19.4
## 13 1953-1954   23.6    43.2 15.8
## 14 1941-1942   23.9    29.8 11.3
## 15 1999-2000   24.4    30.3 16.3
## 16 1954-1955   25.1    32.2 11.5
## 17 1997-1998   25.6    29.6  5.5
## 18 1984-1985   26.6    39.1 24.1
## 19 1978-1979   27.5    89.7 29.4
## 20 1974-1975   27.6    52.2 13.1
## 21 1943-1944   27.7    24.0 23.8
## 22 1950-1951   29.7    54.4 11.6
## 23 1952-1953   29.8    23.4 15.1
## 24 1962-1963   30.9    42.7 16.3
## 25 1951-1952   31.9    66.4 19.7
## 26 1949-1950   32.0    33.8 13.8
## 27 1982-1983   32.7    26.6 27.2
## 28 1958-1959   34.1    41.0 13.0
## 29 2009-2010   35.7    54.2 51.4
## 30 2015-2016   36.2    31.2 32.1
## 31 1998-1999   36.4    50.9 12.7
## 32 1973-1974   36.9    58.3 23.5
## 33 1948-1949   37.1    14.3 46.6
## 34 1989-1990   39.2    33.8 13.4
## 35 2003-2004   39.4    24.8 42.6
## 36 2005-2006   39.9    26.0 40.0
## 37 1959-1960   40.9    50.9 39.2
## 38 1986-1987   42.5    26.2 23.1
## 39 1983-1984   43.0    49.0 25.4
## 40 1965-1966   44.1    24.9 21.4
## 41 1957-1958   44.7    20.0 44.7
## 42 1961-1962   44.7    58.9 18.1
## 43 1967-1968   44.8    28.4 19.5
## 44 1942-1943   45.7    45.2 29.5
## 45 2000-2001   45.9    39.2 35.0
## 46 1975-1976   46.6    43.3 17.3
## 47 1971-1972   47.5    46.8 22.9
## 48 1940-1941   47.8    52.5 39.0
## 49 1969-1970   48.8    77.0 25.6
## 50 1964-1965   50.4    59.5 24.4
## 51 1945-1946   50.8    23.9 31.4
## 52 2007-2008   51.2    60.3 11.9
## 53 1996-1997   51.9    40.6 10.0
## 54 1956-1957   52.0    31.3 21.9
## 55 1987-1988   52.6    42.6 19.1
## 56 1968-1969   53.8    29.4 30.2
## 57 1970-1971   57.3    37.9 15.5
## 58 1976-1977   58.5    54.1 24.5
## 59 2013-2014   58.9    82.0 57.4
## 60 1944-1945   59.2    34.9 27.1
## 61 1966-1967   60.1    68.4 51.5
## 62 1955-1956   60.9    26.3 33.5
## 63 1960-1961   61.5    40.7 54.7
## 64 1981-1982   61.8    59.3 24.6
## 65 1963-1964   63.0    35.2 44.7
## 66 2012-2013   63.4    30.1 26.1
## 67 2008-2009   65.9    52.7 27.6
## 68 2002-2003   70.9    28.6 49.3
## 69 2010-2011   81.0    57.9 61.9
## 70 1992-1993   83.9    46.9 24.5
## 71 1977-1978   85.1    82.3 50.7
## 72 2004-2005   86.6    39.4 41.0
## 73 1947-1948   89.2    38.1 63.2
## 74 1993-1994   96.3    41.8 53.4
## 75 1995-1996  107.6    23.9 75.6
## 76 2014-2015  110.6    50.7 50.3
# descencing order
arrange(snowdata, desc(Boston))
##       Winter Boston Chicago  NYC
## 1  2014-2015  110.6    50.7 50.3
## 2  1995-1996  107.6    23.9 75.6
## 3  1993-1994   96.3    41.8 53.4
## 4  1947-1948   89.2    38.1 63.2
## 5  2004-2005   86.6    39.4 41.0
## 6  1977-1978   85.1    82.3 50.7
## 7  1992-1993   83.9    46.9 24.5
## 8  2010-2011   81.0    57.9 61.9
## 9  2002-2003   70.9    28.6 49.3
## 10 2008-2009   65.9    52.7 27.6
## 11 2012-2013   63.4    30.1 26.1
## 12 1963-1964   63.0    35.2 44.7
## 13 1981-1982   61.8    59.3 24.6
## 14 1960-1961   61.5    40.7 54.7
## 15 1955-1956   60.9    26.3 33.5
## 16 1966-1967   60.1    68.4 51.5
## 17 1944-1945   59.2    34.9 27.1
## 18 2013-2014   58.9    82.0 57.4
## 19 1976-1977   58.5    54.1 24.5
## 20 1970-1971   57.3    37.9 15.5
## 21 1968-1969   53.8    29.4 30.2
## 22 1987-1988   52.6    42.6 19.1
## 23 1956-1957   52.0    31.3 21.9
## 24 1996-1997   51.9    40.6 10.0
## 25 2007-2008   51.2    60.3 11.9
## 26 1945-1946   50.8    23.9 31.4
## 27 1964-1965   50.4    59.5 24.4
## 28 1969-1970   48.8    77.0 25.6
## 29 1940-1941   47.8    52.5 39.0
## 30 1971-1972   47.5    46.8 22.9
## 31 1975-1976   46.6    43.3 17.3
## 32 2000-2001   45.9    39.2 35.0
## 33 1942-1943   45.7    45.2 29.5
## 34 1967-1968   44.8    28.4 19.5
## 35 1957-1958   44.7    20.0 44.7
## 36 1961-1962   44.7    58.9 18.1
## 37 1965-1966   44.1    24.9 21.4
## 38 1983-1984   43.0    49.0 25.4
## 39 1986-1987   42.5    26.2 23.1
## 40 1959-1960   40.9    50.9 39.2
## 41 2005-2006   39.9    26.0 40.0
## 42 2003-2004   39.4    24.8 42.6
## 43 1989-1990   39.2    33.8 13.4
## 44 1948-1949   37.1    14.3 46.6
## 45 1973-1974   36.9    58.3 23.5
## 46 1998-1999   36.4    50.9 12.7
## 47 2015-2016   36.2    31.2 32.1
## 48 2009-2010   35.7    54.2 51.4
## 49 1958-1959   34.1    41.0 13.0
## 50 1982-1983   32.7    26.6 27.2
## 51 1949-1950   32.0    33.8 13.8
## 52 1951-1952   31.9    66.4 19.7
## 53 1962-1963   30.9    42.7 16.3
## 54 1952-1953   29.8    23.4 15.1
## 55 1950-1951   29.7    54.4 11.6
## 56 1943-1944   27.7    24.0 23.8
## 57 1974-1975   27.6    52.2 13.1
## 58 1978-1979   27.5    89.7 29.4
## 59 1984-1985   26.6    39.1 24.1
## 60 1997-1998   25.6    29.6  5.5
## 61 1954-1955   25.1    32.2 11.5
## 62 1999-2000   24.4    30.3 16.3
## 63 1941-1942   23.9    29.8 11.3
## 64 1953-1954   23.6    43.2 15.8
## 65 1980-1981   22.3    35.0 19.4
## 66 1991-1992   22.0    28.4 12.6
## 67 1946-1947   19.4    34.1 30.6
## 68 1990-1991   19.1    36.7 24.9
## 69 1985-1986   18.1    29.0 13.0
## 70 2006-2007   17.1    35.6 12.4
## 71 1988-1989   15.5    24.5  8.1
## 72 2001-2002   15.1    31.1  3.5
## 73 1994-1995   14.9    24.1 11.8
## 74 1979-1980   12.5    42.4 12.8
## 75 1972-1973   10.3    32.9  2.8
## 76 2011-2012    9.3    19.8  7.4
# by a second column
arrange(snowdata, Boston, NYC)
##       Winter Boston Chicago  NYC
## 1  2011-2012    9.3    19.8  7.4
## 2  1972-1973   10.3    32.9  2.8
## 3  1979-1980   12.5    42.4 12.8
## 4  1994-1995   14.9    24.1 11.8
## 5  2001-2002   15.1    31.1  3.5
## 6  1988-1989   15.5    24.5  8.1
## 7  2006-2007   17.1    35.6 12.4
## 8  1985-1986   18.1    29.0 13.0
## 9  1990-1991   19.1    36.7 24.9
## 10 1946-1947   19.4    34.1 30.6
## 11 1991-1992   22.0    28.4 12.6
## 12 1980-1981   22.3    35.0 19.4
## 13 1953-1954   23.6    43.2 15.8
## 14 1941-1942   23.9    29.8 11.3
## 15 1999-2000   24.4    30.3 16.3
## 16 1954-1955   25.1    32.2 11.5
## 17 1997-1998   25.6    29.6  5.5
## 18 1984-1985   26.6    39.1 24.1
## 19 1978-1979   27.5    89.7 29.4
## 20 1974-1975   27.6    52.2 13.1
## 21 1943-1944   27.7    24.0 23.8
## 22 1950-1951   29.7    54.4 11.6
## 23 1952-1953   29.8    23.4 15.1
## 24 1962-1963   30.9    42.7 16.3
## 25 1951-1952   31.9    66.4 19.7
## 26 1949-1950   32.0    33.8 13.8
## 27 1982-1983   32.7    26.6 27.2
## 28 1958-1959   34.1    41.0 13.0
## 29 2009-2010   35.7    54.2 51.4
## 30 2015-2016   36.2    31.2 32.1
## 31 1998-1999   36.4    50.9 12.7
## 32 1973-1974   36.9    58.3 23.5
## 33 1948-1949   37.1    14.3 46.6
## 34 1989-1990   39.2    33.8 13.4
## 35 2003-2004   39.4    24.8 42.6
## 36 2005-2006   39.9    26.0 40.0
## 37 1959-1960   40.9    50.9 39.2
## 38 1986-1987   42.5    26.2 23.1
## 39 1983-1984   43.0    49.0 25.4
## 40 1965-1966   44.1    24.9 21.4
## 41 1961-1962   44.7    58.9 18.1
## 42 1957-1958   44.7    20.0 44.7
## 43 1967-1968   44.8    28.4 19.5
## 44 1942-1943   45.7    45.2 29.5
## 45 2000-2001   45.9    39.2 35.0
## 46 1975-1976   46.6    43.3 17.3
## 47 1971-1972   47.5    46.8 22.9
## 48 1940-1941   47.8    52.5 39.0
## 49 1969-1970   48.8    77.0 25.6
## 50 1964-1965   50.4    59.5 24.4
## 51 1945-1946   50.8    23.9 31.4
## 52 2007-2008   51.2    60.3 11.9
## 53 1996-1997   51.9    40.6 10.0
## 54 1956-1957   52.0    31.3 21.9
## 55 1987-1988   52.6    42.6 19.1
## 56 1968-1969   53.8    29.4 30.2
## 57 1970-1971   57.3    37.9 15.5
## 58 1976-1977   58.5    54.1 24.5
## 59 2013-2014   58.9    82.0 57.4
## 60 1944-1945   59.2    34.9 27.1
## 61 1966-1967   60.1    68.4 51.5
## 62 1955-1956   60.9    26.3 33.5
## 63 1960-1961   61.5    40.7 54.7
## 64 1981-1982   61.8    59.3 24.6
## 65 1963-1964   63.0    35.2 44.7
## 66 2012-2013   63.4    30.1 26.1
## 67 2008-2009   65.9    52.7 27.6
## 68 2002-2003   70.9    28.6 49.3
## 69 2010-2011   81.0    57.9 61.9
## 70 1992-1993   83.9    46.9 24.5
## 71 1977-1978   85.1    82.3 50.7
## 72 2004-2005   86.6    39.4 41.0
## 73 1947-1948   89.2    38.1 63.2
## 74 1993-1994   96.3    41.8 53.4
## 75 1995-1996  107.6    23.9 75.6
## 76 2014-2015  110.6    50.7 50.3
# selecting columns
select(snowdata, Winter, Boston)
##       Winter Boston
## 1  1940-1941   47.8
## 2  1941-1942   23.9
## 3  1942-1943   45.7
## 4  1943-1944   27.7
## 5  1944-1945   59.2
## 6  1945-1946   50.8
## 7  1946-1947   19.4
## 8  1947-1948   89.2
## 9  1948-1949   37.1
## 10 1949-1950   32.0
## 11 1950-1951   29.7
## 12 1951-1952   31.9
## 13 1952-1953   29.8
## 14 1953-1954   23.6
## 15 1954-1955   25.1
## 16 1955-1956   60.9
## 17 1956-1957   52.0
## 18 1957-1958   44.7
## 19 1958-1959   34.1
## 20 1959-1960   40.9
## 21 1960-1961   61.5
## 22 1961-1962   44.7
## 23 1962-1963   30.9
## 24 1963-1964   63.0
## 25 1964-1965   50.4
## 26 1965-1966   44.1
## 27 1966-1967   60.1
## 28 1967-1968   44.8
## 29 1968-1969   53.8
## 30 1969-1970   48.8
## 31 1970-1971   57.3
## 32 1971-1972   47.5
## 33 1972-1973   10.3
## 34 1973-1974   36.9
## 35 1974-1975   27.6
## 36 1975-1976   46.6
## 37 1976-1977   58.5
## 38 1977-1978   85.1
## 39 1978-1979   27.5
## 40 1979-1980   12.5
## 41 1980-1981   22.3
## 42 1981-1982   61.8
## 43 1982-1983   32.7
## 44 1983-1984   43.0
## 45 1984-1985   26.6
## 46 1985-1986   18.1
## 47 1986-1987   42.5
## 48 1987-1988   52.6
## 49 1988-1989   15.5
## 50 1989-1990   39.2
## 51 1990-1991   19.1
## 52 1991-1992   22.0
## 53 1992-1993   83.9
## 54 1993-1994   96.3
## 55 1994-1995   14.9
## 56 1995-1996  107.6
## 57 1996-1997   51.9
## 58 1997-1998   25.6
## 59 1998-1999   36.4
## 60 1999-2000   24.4
## 61 2000-2001   45.9
## 62 2001-2002   15.1
## 63 2002-2003   70.9
## 64 2003-2004   39.4
## 65 2004-2005   86.6
## 66 2005-2006   39.9
## 67 2006-2007   17.1
## 68 2007-2008   51.2
## 69 2008-2009   65.9
## 70 2009-2010   35.7
## 71 2010-2011   81.0
## 72 2011-2012    9.3
## 73 2012-2013   63.4
## 74 2013-2014   58.9
## 75 2014-2015  110.6
## 76 2015-2016   36.2
select(snowdata, Boston:NYC)
##    Boston Chicago  NYC
## 1    47.8    52.5 39.0
## 2    23.9    29.8 11.3
## 3    45.7    45.2 29.5
## 4    27.7    24.0 23.8
## 5    59.2    34.9 27.1
## 6    50.8    23.9 31.4
## 7    19.4    34.1 30.6
## 8    89.2    38.1 63.2
## 9    37.1    14.3 46.6
## 10   32.0    33.8 13.8
## 11   29.7    54.4 11.6
## 12   31.9    66.4 19.7
## 13   29.8    23.4 15.1
## 14   23.6    43.2 15.8
## 15   25.1    32.2 11.5
## 16   60.9    26.3 33.5
## 17   52.0    31.3 21.9
## 18   44.7    20.0 44.7
## 19   34.1    41.0 13.0
## 20   40.9    50.9 39.2
## 21   61.5    40.7 54.7
## 22   44.7    58.9 18.1
## 23   30.9    42.7 16.3
## 24   63.0    35.2 44.7
## 25   50.4    59.5 24.4
## 26   44.1    24.9 21.4
## 27   60.1    68.4 51.5
## 28   44.8    28.4 19.5
## 29   53.8    29.4 30.2
## 30   48.8    77.0 25.6
## 31   57.3    37.9 15.5
## 32   47.5    46.8 22.9
## 33   10.3    32.9  2.8
## 34   36.9    58.3 23.5
## 35   27.6    52.2 13.1
## 36   46.6    43.3 17.3
## 37   58.5    54.1 24.5
## 38   85.1    82.3 50.7
## 39   27.5    89.7 29.4
## 40   12.5    42.4 12.8
## 41   22.3    35.0 19.4
## 42   61.8    59.3 24.6
## 43   32.7    26.6 27.2
## 44   43.0    49.0 25.4
## 45   26.6    39.1 24.1
## 46   18.1    29.0 13.0
## 47   42.5    26.2 23.1
## 48   52.6    42.6 19.1
## 49   15.5    24.5  8.1
## 50   39.2    33.8 13.4
## 51   19.1    36.7 24.9
## 52   22.0    28.4 12.6
## 53   83.9    46.9 24.5
## 54   96.3    41.8 53.4
## 55   14.9    24.1 11.8
## 56  107.6    23.9 75.6
## 57   51.9    40.6 10.0
## 58   25.6    29.6  5.5
## 59   36.4    50.9 12.7
## 60   24.4    30.3 16.3
## 61   45.9    39.2 35.0
## 62   15.1    31.1  3.5
## 63   70.9    28.6 49.3
## 64   39.4    24.8 42.6
## 65   86.6    39.4 41.0
## 66   39.9    26.0 40.0
## 67   17.1    35.6 12.4
## 68   51.2    60.3 11.9
## 69   65.9    52.7 27.6
## 70   35.7    54.2 51.4
## 71   81.0    57.9 61.9
## 72    9.3    19.8  7.4
## 73   63.4    30.1 26.1
## 74   58.9    82.0 57.4
## 75  110.6    50.7 50.3
## 76   36.2    31.2 32.1
select(snowdata, contains("C"))
##    Chicago  NYC
## 1     52.5 39.0
## 2     29.8 11.3
## 3     45.2 29.5
## 4     24.0 23.8
## 5     34.9 27.1
## 6     23.9 31.4
## 7     34.1 30.6
## 8     38.1 63.2
## 9     14.3 46.6
## 10    33.8 13.8
## 11    54.4 11.6
## 12    66.4 19.7
## 13    23.4 15.1
## 14    43.2 15.8
## 15    32.2 11.5
## 16    26.3 33.5
## 17    31.3 21.9
## 18    20.0 44.7
## 19    41.0 13.0
## 20    50.9 39.2
## 21    40.7 54.7
## 22    58.9 18.1
## 23    42.7 16.3
## 24    35.2 44.7
## 25    59.5 24.4
## 26    24.9 21.4
## 27    68.4 51.5
## 28    28.4 19.5
## 29    29.4 30.2
## 30    77.0 25.6
## 31    37.9 15.5
## 32    46.8 22.9
## 33    32.9  2.8
## 34    58.3 23.5
## 35    52.2 13.1
## 36    43.3 17.3
## 37    54.1 24.5
## 38    82.3 50.7
## 39    89.7 29.4
## 40    42.4 12.8
## 41    35.0 19.4
## 42    59.3 24.6
## 43    26.6 27.2
## 44    49.0 25.4
## 45    39.1 24.1
## 46    29.0 13.0
## 47    26.2 23.1
## 48    42.6 19.1
## 49    24.5  8.1
## 50    33.8 13.4
## 51    36.7 24.9
## 52    28.4 12.6
## 53    46.9 24.5
## 54    41.8 53.4
## 55    24.1 11.8
## 56    23.9 75.6
## 57    40.6 10.0
## 58    29.6  5.5
## 59    50.9 12.7
## 60    30.3 16.3
## 61    39.2 35.0
## 62    31.1  3.5
## 63    28.6 49.3
## 64    24.8 42.6
## 65    39.4 41.0
## 66    26.0 40.0
## 67    35.6 12.4
## 68    60.3 11.9
## 69    52.7 27.6
## 70    54.2 51.4
## 71    57.9 61.9
## 72    19.8  7.4
## 73    30.1 26.1
## 74    82.0 57.4
## 75    50.7 50.3
## 76    31.2 32.1
select(snowdata, ends_with("C"))
##     NYC
## 1  39.0
## 2  11.3
## 3  29.5
## 4  23.8
## 5  27.1
## 6  31.4
## 7  30.6
## 8  63.2
## 9  46.6
## 10 13.8
## 11 11.6
## 12 19.7
## 13 15.1
## 14 15.8
## 15 11.5
## 16 33.5
## 17 21.9
## 18 44.7
## 19 13.0
## 20 39.2
## 21 54.7
## 22 18.1
## 23 16.3
## 24 44.7
## 25 24.4
## 26 21.4
## 27 51.5
## 28 19.5
## 29 30.2
## 30 25.6
## 31 15.5
## 32 22.9
## 33  2.8
## 34 23.5
## 35 13.1
## 36 17.3
## 37 24.5
## 38 50.7
## 39 29.4
## 40 12.8
## 41 19.4
## 42 24.6
## 43 27.2
## 44 25.4
## 45 24.1
## 46 13.0
## 47 23.1
## 48 19.1
## 49  8.1
## 50 13.4
## 51 24.9
## 52 12.6
## 53 24.5
## 54 53.4
## 55 11.8
## 56 75.6
## 57 10.0
## 58  5.5
## 59 12.7
## 60 16.3
## 61 35.0
## 62  3.5
## 63 49.3
## 64 42.6
## 65 41.0
## 66 40.0
## 67 12.4
## 68 11.9
## 69 27.6
## 70 51.4
## 71 61.9
## 72  7.4
## 73 26.1
## 74 57.4
## 75 50.3
## 76 32.1
select(snowdata, -ends_with("C"))
##       Winter Boston Chicago
## 1  1940-1941   47.8    52.5
## 2  1941-1942   23.9    29.8
## 3  1942-1943   45.7    45.2
## 4  1943-1944   27.7    24.0
## 5  1944-1945   59.2    34.9
## 6  1945-1946   50.8    23.9
## 7  1946-1947   19.4    34.1
## 8  1947-1948   89.2    38.1
## 9  1948-1949   37.1    14.3
## 10 1949-1950   32.0    33.8
## 11 1950-1951   29.7    54.4
## 12 1951-1952   31.9    66.4
## 13 1952-1953   29.8    23.4
## 14 1953-1954   23.6    43.2
## 15 1954-1955   25.1    32.2
## 16 1955-1956   60.9    26.3
## 17 1956-1957   52.0    31.3
## 18 1957-1958   44.7    20.0
## 19 1958-1959   34.1    41.0
## 20 1959-1960   40.9    50.9
## 21 1960-1961   61.5    40.7
## 22 1961-1962   44.7    58.9
## 23 1962-1963   30.9    42.7
## 24 1963-1964   63.0    35.2
## 25 1964-1965   50.4    59.5
## 26 1965-1966   44.1    24.9
## 27 1966-1967   60.1    68.4
## 28 1967-1968   44.8    28.4
## 29 1968-1969   53.8    29.4
## 30 1969-1970   48.8    77.0
## 31 1970-1971   57.3    37.9
## 32 1971-1972   47.5    46.8
## 33 1972-1973   10.3    32.9
## 34 1973-1974   36.9    58.3
## 35 1974-1975   27.6    52.2
## 36 1975-1976   46.6    43.3
## 37 1976-1977   58.5    54.1
## 38 1977-1978   85.1    82.3
## 39 1978-1979   27.5    89.7
## 40 1979-1980   12.5    42.4
## 41 1980-1981   22.3    35.0
## 42 1981-1982   61.8    59.3
## 43 1982-1983   32.7    26.6
## 44 1983-1984   43.0    49.0
## 45 1984-1985   26.6    39.1
## 46 1985-1986   18.1    29.0
## 47 1986-1987   42.5    26.2
## 48 1987-1988   52.6    42.6
## 49 1988-1989   15.5    24.5
## 50 1989-1990   39.2    33.8
## 51 1990-1991   19.1    36.7
## 52 1991-1992   22.0    28.4
## 53 1992-1993   83.9    46.9
## 54 1993-1994   96.3    41.8
## 55 1994-1995   14.9    24.1
## 56 1995-1996  107.6    23.9
## 57 1996-1997   51.9    40.6
## 58 1997-1998   25.6    29.6
## 59 1998-1999   36.4    50.9
## 60 1999-2000   24.4    30.3
## 61 2000-2001   45.9    39.2
## 62 2001-2002   15.1    31.1
## 63 2002-2003   70.9    28.6
## 64 2003-2004   39.4    24.8
## 65 2004-2005   86.6    39.4
## 66 2005-2006   39.9    26.0
## 67 2006-2007   17.1    35.6
## 68 2007-2008   51.2    60.3
## 69 2008-2009   65.9    52.7
## 70 2009-2010   35.7    54.2
## 71 2010-2011   81.0    57.9
## 72 2011-2012    9.3    19.8
## 73 2012-2013   63.4    30.1
## 74 2013-2014   58.9    82.0
## 75 2014-2015  110.6    50.7
## 76 2015-2016   36.2    31.2
snowdata_numeric <- select_if(snowdata, is.numeric)
psych::describe(snowdata_numeric)
##         vars  n  mean    sd median trimmed   mad  min   max range skew kurtosis
## Boston     1 76 44.49 22.51  42.75   42.37 22.54  9.3 110.6 101.3 0.84     0.46
## Chicago    2 76 40.88 15.71  38.00   39.21 14.23 14.3  89.7  75.4 0.97     0.66
## NYC        3 76 27.05 15.89  24.25   25.64 16.01  2.8  75.6  72.8 0.84     0.03
##           se
## Boston  2.58
## Chicago 1.80
## NYC     1.82
# subsetting by row and column numbers
lastrow <- nrow(snowdata)
snowdata[lastrow, ]
##       Winter Boston Chicago  NYC
## 76 2015-2016   36.2    31.2 32.1
# one line of code gives you the same exact result as another
identical(snowdata[76,], snowdata[lastrow,])
## [1] TRUE
# getting the lowest and the highest value
range(snowdata$Boston)
## [1]   9.3 110.6
# pulling the row from snowdata that has the lowest Boston winter snow total
slice(snowdata, which.min(Boston))
##      Winter Boston Chicago NYC
## 1 2011-2012    9.3    19.8 7.4
# tidyselect
# relocating columns with everything()
mpg %>%
  select(manufacturer, cyl, everything()) %>%
  glimpse()
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
# selecting the last column
mpg %>%
  select(last_col()) %>%
  glimpse()
## Rows: 234
## Columns: 1
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "compact"…
# selecting all columns except the last one
mpg %>%
  select(!last_col()) %>%
  glimpse()
## Rows: 234
## Columns: 10
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
# you can use last_col to select the n-to-last column
mpg %>%
  select(last_col(1)) %>%
  glimpse()
## Rows: 234
## Columns: 1
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
# select all columns that start with the letter “m”
mpg %>%
  select(starts_with("m")) %>%
  glimpse()
## Rows: 234
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
mpg %>%
  select(ends_with(c("l", "r"))) %>%
  glimpse()
## Rows: 234
## Columns: 6
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
# selecting columns that contain certain strings
mpg %>%
  select(contains("m")) %>%
  glimpse()
## Rows: 234
## Columns: 2
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
mpg %>%
  rename(Manufacturer = manufacturer) %>%
  select(contains("m", ignore.case = FALSE)) %>%
  glimpse()
## Rows: 234
## Columns: 1
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "a4 quat…
# selecting all columns that contain a number
billboard %>%
  select(matches("\\d")) %>%
  colnames()
##  [1] "wk1"  "wk2"  "wk3"  "wk4"  "wk5"  "wk6"  "wk7"  "wk8"  "wk9"  "wk10"
## [11] "wk11" "wk12" "wk13" "wk14" "wk15" "wk16" "wk17" "wk18" "wk19" "wk20"
## [21] "wk21" "wk22" "wk23" "wk24" "wk25" "wk26" "wk27" "wk28" "wk29" "wk30"
## [31] "wk31" "wk32" "wk33" "wk34" "wk35" "wk36" "wk37" "wk38" "wk39" "wk40"
## [41] "wk41" "wk42" "wk43" "wk44" "wk45" "wk46" "wk47" "wk48" "wk49" "wk50"
## [51] "wk51" "wk52" "wk53" "wk54" "wk55" "wk56" "wk57" "wk58" "wk59" "wk60"
## [61] "wk61" "wk62" "wk63" "wk64" "wk65" "wk66" "wk67" "wk68" "wk69" "wk70"
## [71] "wk71" "wk72" "wk73" "wk74" "wk75" "wk76"
billboard %>%
  select(matches("wk\\d{1}$")) %>%
  colnames()
## [1] "wk1" "wk2" "wk3" "wk4" "wk5" "wk6" "wk7" "wk8" "wk9"
anscombe %>%
  select(matches("[xy][1-2]")) %>%
  glimpse()
## Rows: 11
## Columns: 4
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ y1 <dbl> 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68
## $ y2 <dbl> 9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74
# selecting columns with number ranges
anscombe %>%
  select(num_range("x", 1:2)) %>%
  glimpse()
## Rows: 11
## Columns: 2
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
billboard %>%
  select(num_range("wk", 1:15)) %>%
  glimpse()
## Rows: 317
## Columns: 15
## $ wk1  <dbl> 87, 91, 81, 76, 57, 51, 97, 84, 59, 76, 84, 57, 50, 71, 79, 80, 9…
## $ wk2  <dbl> 82, 87, 70, 76, 34, 39, 97, 62, 53, 76, 84, 47, 39, 51, 65, 78, 9…
## $ wk3  <dbl> 72, 92, 68, 72, 25, 34, 96, 51, 38, 74, 75, 45, 30, 28, 53, 76, 9…
## $ wk4  <dbl> 77, NA, 67, 69, 17, 26, 95, 41, 28, 69, 73, 29, 28, 18, 48, 77, 9…
## $ wk5  <dbl> 87, NA, 66, 67, 17, 26, 100, 38, 21, 68, 73, 23, 21, 13, 45, 92, …
## $ wk6  <dbl> 94, NA, 57, 65, 31, 19, NA, 35, 18, 67, 69, 18, 19, 13, 36, NA, 9…
## $ wk7  <dbl> 99, NA, 54, 55, 36, 2, NA, 35, 16, 61, 68, 11, 20, 11, 34, NA, 93…
## $ wk8  <dbl> NA, NA, 53, 59, 49, 2, NA, 38, 14, 58, 65, 9, 17, 1, 29, NA, 96, …
## $ wk9  <dbl> NA, NA, 51, 62, 53, 3, NA, 38, 12, 57, 73, 9, 17, 1, 27, NA, NA, …
## $ wk10 <dbl> NA, NA, 51, 61, 57, 6, NA, 36, 10, 59, 83, 11, 17, 2, 30, NA, NA,…
## $ wk11 <dbl> NA, NA, 51, 61, 64, 7, NA, 37, 9, 66, 92, 1, 17, 2, 36, NA, 99, N…
## $ wk12 <dbl> NA, NA, 51, 59, 70, 22, NA, 37, 8, 68, NA, 1, 3, 3, 37, NA, NA, 9…
## $ wk13 <dbl> NA, NA, 47, 61, 75, 29, NA, 38, 6, 61, NA, 1, 3, 3, 39, NA, 96, N…
## $ wk14 <dbl> NA, NA, 44, 66, 76, 36, NA, 49, 1, 67, NA, 1, 7, 4, 49, NA, 96, N…
## $ wk15 <dbl> NA, NA, 38, 72, 78, 47, NA, 61, 2, 59, NA, 4, 10, 12, 57, NA, 99,…
# selecting columns of a specific type
billboard %>%
  select(where(is.character)) %>%
  glimpse()
## Rows: 317
## Columns: 2
## $ artist <chr> "2 Pac", "2Ge+her", "3 Doors Down", "3 Doors Down", "504 Boyz",…
## $ track  <chr> "Baby Don't Cry (Keep...", "The Hardest Part Of ...", "Kryptoni…
# combining selections
mpg %>%
  select(where(is.character) & contains("l")) %>%
  glimpse()
## Rows: 234
## Columns: 3
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "a4 quat…
## $ fl    <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "compact"…
mpg %>%
  select(where(is.character) | contains("l")) %>%
  glimpse()
## Rows: 234
## Columns: 8
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
# how to change the variable type across many columns
mpg %>%
  mutate(
    across(
      .cols = where(is.character),
      .fns = as_factor,
      .names = "{.col}_as_factor"
    )
  ) %>%
  dplyr::select(where(is.factor) | where(is.character)) %>%
  glimpse()
## Rows: 234
## Columns: 12
## $ manufacturer_as_factor <fct> audi, audi, audi, audi, audi, audi, audi, audi,…
## $ model_as_factor        <fct> a4, a4, a4, a4, a4, a4, a4, a4 quattro, a4 quat…
## $ trans_as_factor        <fct> auto(l5), manual(m5), manual(m6), auto(av), aut…
## $ drv_as_factor          <fct> f, f, f, f, f, f, f, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ fl_as_factor           <fct> p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p,…
## $ class_as_factor        <fct> compact, compact, compact, compact, compact, co…
## $ manufacturer           <chr> "audi", "audi", "audi", "audi", "audi", "audi",…
## $ model                  <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 q…
## $ trans                  <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(a…
## $ drv                    <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4…
## $ fl                     <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class                  <chr> "compact", "compact", "compact", "compact", "co…
# how to normalize many columns (mean of 0 and a standard deviation of 1)
scaled_columns <- mpg %>%
  transmute(
    across(
      .cols = where(is.numeric),
      .fns = ~ scale(.)[,1],
    )
  )

scaled_columns %>%
  glimpse()
## Rows: 234
## Columns: 5
## $ displ <dbl> -1.2939999, -1.2939999, -1.1391962, -1.1391962, -0.5199816, -0.5…
## $ year  <dbl> -0.997861, -0.997861, 0.997861, 0.997861, -0.997861, -0.997861, …
## $ cyl   <dbl> -1.1721058, -1.1721058, -1.1721058, -1.1721058, 0.0689474, 0.068…
## $ cty   <dbl> 0.26810155, 0.97299777, 0.73803236, 0.97299777, -0.20182926, 0.2…
## $ hwy   <dbl> 0.93369639, 0.93369639, 1.26956872, 1.10163255, 0.42988788, 0.42…
# checking the results
scaled_columns %>%
  dplyr::summarise(
    across(
      .cols = everything(),
      .fns = list(mean = mean, sd = sd)
    )
  ) %>%
  glimpse()
## Rows: 1
## Columns: 10
## $ displ_mean <dbl> -0.00000000000000008166417
## $ displ_sd   <dbl> 1
## $ year_mean  <dbl> 0
## $ year_sd    <dbl> 1
## $ cyl_mean   <dbl> -0.0000000000000002693067
## $ cyl_sd     <dbl> 1
## $ cty_mean   <dbl> 0.0000000000000002980539
## $ cty_sd     <dbl> 1
## $ hwy_mean   <dbl> -0.00000000000000001290316
## $ hwy_sd     <dbl> 1
# how to impute values across many columns
# getting the data
(dframe <- tibble(
  group = c("a", "a", "a", "b", "b", "b"),
  x = c(3, 5, 4, NA, 4, 8),
  y = c(2, NA, 3, 1, 9, 7)
))
## # A tibble: 6 × 3
##   group     x     y
##   <chr> <dbl> <dbl>
## 1 a         3     2
## 2 a         5    NA
## 3 a         4     3
## 4 b        NA     1
## 5 b         4     9
## 6 b         8     7
# replace each missing value with the mean value of the respective column
dframe %>%
  mutate(
    across(
      .cols = c(x, y), # or everything()
      .fns = ~ ifelse(test = is.na(.),
        yes = mean(., na.rm = TRUE),
        no = .
      )
    )
  )
## # A tibble: 6 × 3
##   group     x     y
##   <chr> <dbl> <dbl>
## 1 a       3     2  
## 2 a       5     4.4
## 3 a       4     3  
## 4 b       4.8   1  
## 5 b       4     9  
## 6 b       8     7
# or
dframe %>%
  mutate(
    across(
      .cols = c(x, y), # or everything()
      .fns = ~ case_when(
        is.na(.) ~ mean(., na.rm = TRUE),
        TRUE ~ .
      )
    )
  )
## # A tibble: 6 × 3
##   group     x     y
##   <chr> <dbl> <dbl>
## 1 a       3     2  
## 2 a       5     4.4
## 3 a       4     3  
## 4 b       4.8   1  
## 5 b       4     9  
## 6 b       8     7
# you can also impute the values within groups
dframe %>%
  group_by(group) %>%
  mutate(
    across(
      .cols = c(x, y), # or everything()
      .fns = ~ case_when(
        is.na(.) ~ mean(., na.rm = TRUE),
        TRUE ~ .
      )
    )
  ) %>%
  ungroup()
## # A tibble: 6 × 3
##   group     x     y
##   <chr> <dbl> <dbl>
## 1 a         3   2  
## 2 a         5   2.5
## 3 a         4   3  
## 4 b         6   1  
## 5 b         4   9  
## 6 b         8   7
# how to replace characters across many columns
# getting the data
typo_dframe <- tribble(
  ~pre_test, ~post_test,
  "goud", "good",
  "medium", "good",
  "metium", "metium",
  "bad", "goud"
)

(typo_corrected <- typo_dframe %>%
  mutate(
    across(
      .cols = everything(),
      .fns = ~ case_when(
        str_detect(., "goud") ~ str_replace(., "goud", "good"),
        str_detect(., "metium") ~ str_replace(., "metium", "medium"),
        TRUE ~ .
      )
    )
  ))
## # A tibble: 4 × 2
##   pre_test post_test
##   <chr>    <chr>    
## 1 good     good     
## 2 medium   good     
## 3 medium   medium   
## 4 bad      good

6.3 Packages

data.table and the tidyverse packages offer simpler solutions and speed up the workflow for the types of problems we are dealing with. Both can be used for the same tasks. You can learn one of them or both. The syntax used for data.table is often more concise and arguably more consistent than that in dplyr (it is in essence an extension of the [i, j] notation that we have already used for data frames). Second, it is fast and memory-efficient, which makes a huge difference if you are working with big data. On the other hand, many people prefer the syntax in dplyr and tidyr, which lends itself exceptionally well for usage with pipes. If you work with small or medium-sized datasets, the difference in performance between the two packages is negligible. dplyr is also much better suited for working directly with databases, which is a huge selling point if your data already is in a database.

data.table relies heavily on the [i, j] notation that is used for data frames in R. It also adds a third element: [i, j, by]. Using this, R selects the rows indicated by i, the columns indicated by j and groups them by by. This makes it easy e.g. to compute grouped summaries. With the tidyverse packages you will instead use new functions with names like filter and summarise to perform operations on your data. These are typically combined using the pipe operator, %>%, which makes the code flow nicely from left to right.

data.table syntax, as it only works on data.table objects. Luckily, dplyr works perfectly when used on data.table objects. Note that when using data.table, there is not an explicit assignment. Note that when using data.table, there is not an explicit assignment. We don”t use <- to assign the new data frame to aq - instead the assignment happens automatically. This means that you have to be a little bit careful, so that you don”t inadvertently data when playing around with it.

When working with tidyverse packages, commands are usually chained together using %>% pipes. When using data.table, commands are chained by repeated use of [] brackets on the same line.

To change the name of a variable, we can use setnames from data.table or rename from dplyr.

You”ll frequently want to filter away some rows from your data. Perhaps you only want to select rows where a variable exceeds some value, or want to exclude rows with NA values. This can be done in several different ways: using row numbers, using conditions, at random, or using regular expressions.

In some situations you may wish to draw a random sample from your data. This is done using the sample (data.table) and sample_n (dplyr) functions.

In some cases, particularly when working with text data, you”ll want to filter using regular expressions. data.table has a convenience function called %like% that can be used to call grepl in an alternative (less opaque?) way. With dplyr we use grepl in the usual fashion.

Another common situation is that you want to remove some variables from your data. Perhaps the variables aren”t of interest in a particular analysis that you”re going to perform, or perhaps you”ve simply imported more variables than you need. As with rows, this can be done using numbers, names or regular expressions. When selecting a single column from a data frame, you sometimes want to extract the column as a vector and sometimes as a single-column data frame (for instance if you are going to pass it to a function that takes a data frame as input). You should be a little bit careful when doing this, to make sure that you get the column in the correct format.

In data.table, using regular expressions to select columns is done using grep. dplyr differs in that it has several convenience functions for selecting columns, like starts_with, ends_with, contains.

Sometimes you don”t want to filter rows, but rearrange their order according to their values for some variable. Similarly, you may want to change the order of the columns in your data. This is often useful for presentation purposes, but can at times also aid in analyses.

In some cases, you may want to fill missing values of a variable with the previous non-missing entry. To fill the missing values with the last non-missing entry, we can now use nafill or fill.

dplyr’s distinct() function will remove data frame rows based on duplication in certain columns. If you want to keep all the other variables in a data frame and not just the non-repetitive ones, distinct() needs the additional .keep_all = TRUE argument. Otherwise, it will return a data frame with only the columns you want to ensure haven’t been duplicated.

aq <- as.data.table(airquality)

# modifying a variable
# data.table
aq[, Wind := Wind * 0.44704]
# dplyr
aq %>% mutate(Wind = Wind * 0.44704) -> aq

# computing a new variable based on existing variables
# data.table
aq[, Hot := Temp > 90]
# dplyr
aq %>% mutate(Hot = Temp > 90) -> aq

# renaming a variable
# data.table
setnames(aq, "Hot", "HotDay")
# dplyr
# aq %>% rename(HotDay = Hot) -> aq

# removing a variable
# data.table
aq[, HotDay := NULL]
aq[, c("Month", "Day") := NULL] # multiple cols
# dplyr
# aq %>% select(-HotDay) -> aq
# aq %>% select(-Month, -Day) -> aq # multiple cols

# chaining commands
# data.table
aq <- as.data.table(airquality)
aq[, Month := nafill(Month, "locf")][, .N, Month]
##    Month  N
## 1:     5 31
## 2:     6 30
## 3:     7 31
## 4:     8 31
## 5:     9 30
# dplyr
aq %>%
  fill(Month) %>%
  group_by(Month, across(days = n()))
## # A tibble: 153 × 7
## # Groups:   Month, Ozone, Solar.R, Wind, Temp, Day, Hot [153]
##    Ozone Solar.R  Wind  Temp Month   Day Hot  
##    <int>   <int> <dbl> <int> <int> <int> <lgl>
##  1    41     190   7.4    67     5     1 FALSE
##  2    36     118   8      72     5     2 FALSE
##  3    12     149  12.6    74     5     3 FALSE
##  4    18     313  11.5    62     5     4 FALSE
##  5    NA      NA  14.3    56     5     5 FALSE
##  6    28      NA  14.9    66     5     6 FALSE
##  7    23     299   8.6    65     5     7 FALSE
##  8    19      99  13.8    59     5     8 FALSE
##  9     8      19  20.1    61     5     9 FALSE
## 10    NA     194   8.6    69     5    10 FALSE
## # … with 143 more rows
# realoading the data
aq <- as.data.table(airquality)

# filtering using row numbers
# data.table
aq[3, ]
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
aq[3:5, ]
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
## 2:    18     313 11.5   62     5   4 FALSE
## 3:    NA      NA 14.3   56     5   5 FALSE
aq[c(3, 7, 15), ]
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
## 2:    23     299  8.6   65     5   7 FALSE
## 3:    18      65 13.2   58     5  15 FALSE
aq[-c(3, 7, 15), ]
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    18     313 11.5   62     5   4 FALSE
##   4:    NA      NA 14.3   56     5   5 FALSE
##   5:    28      NA 14.9   66     5   6 FALSE
##  ---                                        
## 146:    30     193  6.9   70     9  26 FALSE
## 147:    NA     145 13.2   77     9  27 FALSE
## 148:    14     191 14.3   75     9  28 FALSE
## 149:    18     131  8.0   76     9  29 FALSE
## 150:    20     223 11.5   68     9  30 FALSE
# dplyr
aq %>% slice(3)
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
aq %>% slice(3:5)
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
## 2:    18     313 11.5   62     5   4 FALSE
## 3:    NA      NA 14.3   56     5   5 FALSE
aq %>% slice(c(3, 7, 15))
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    12     149 12.6   74     5   3 FALSE
## 2:    23     299  8.6   65     5   7 FALSE
## 3:    18      65 13.2   58     5  15 FALSE
aq %>% slice(-c(3, 7, 15))
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    18     313 11.5   62     5   4 FALSE
##   4:    NA      NA 14.3   56     5   5 FALSE
##   5:    28      NA 14.9   66     5   6 FALSE
##  ---                                        
## 146:    30     193  6.9   70     9  26 FALSE
## 147:    NA     145 13.2   77     9  27 FALSE
## 148:    14     191 14.3   75     9  28 FALSE
## 149:    18     131  8.0   76     9  29 FALSE
## 150:    20     223 11.5   68     9  30 FALSE
# filtering using conditions
# data.table
aq[Temp > 90, ]
##     Ozone Solar.R Wind Temp Month Day  Hot
##  1:    NA     259 10.9   93     6  11 TRUE
##  2:    NA     250  9.2   92     6  12 TRUE
##  3:    97     267  6.3   92     7   8 TRUE
##  4:    97     272  5.7   92     7   9 TRUE
##  5:    NA     291 14.9   91     7  14 TRUE
##  6:    NA     222  8.6   92     8  10 TRUE
##  7:    76     203  9.7   97     8  28 TRUE
##  8:   118     225  2.3   94     8  29 TRUE
##  9:    84     237  6.3   96     8  30 TRUE
## 10:    85     188  6.3   94     8  31 TRUE
## 11:    96     167  6.9   91     9   1 TRUE
## 12:    78     197  5.1   92     9   2 TRUE
## 13:    73     183  2.8   93     9   3 TRUE
## 14:    91     189  4.6   93     9   4 TRUE
aq[Month == 6, ]
##     Ozone Solar.R Wind Temp Month Day   Hot
##  1:    NA     286  8.6   78     6   1 FALSE
##  2:    NA     287  9.7   74     6   2 FALSE
##  3:    NA     242 16.1   67     6   3 FALSE
##  4:    NA     186  9.2   84     6   4 FALSE
##  5:    NA     220  8.6   85     6   5 FALSE
##  6:    NA     264 14.3   79     6   6 FALSE
##  7:    29     127  9.7   82     6   7 FALSE
##  8:    NA     273  6.9   87     6   8 FALSE
##  9:    71     291 13.8   90     6   9 FALSE
## 10:    39     323 11.5   87     6  10 FALSE
## 11:    NA     259 10.9   93     6  11  TRUE
## 12:    NA     250  9.2   92     6  12  TRUE
## 13:    23     148  8.0   82     6  13 FALSE
## 14:    NA     332 13.8   80     6  14 FALSE
## 15:    NA     322 11.5   79     6  15 FALSE
## 16:    21     191 14.9   77     6  16 FALSE
## 17:    37     284 20.7   72     6  17 FALSE
## 18:    20      37  9.2   65     6  18 FALSE
## 19:    12     120 11.5   73     6  19 FALSE
## 20:    13     137 10.3   76     6  20 FALSE
## 21:    NA     150  6.3   77     6  21 FALSE
## 22:    NA      59  1.7   76     6  22 FALSE
## 23:    NA      91  4.6   76     6  23 FALSE
## 24:    NA     250  6.3   76     6  24 FALSE
## 25:    NA     135  8.0   75     6  25 FALSE
## 26:    NA     127  8.0   78     6  26 FALSE
## 27:    NA      47 10.3   73     6  27 FALSE
## 28:    NA      98 11.5   80     6  28 FALSE
## 29:    NA      31 14.9   77     6  29 FALSE
## 30:    NA     138  8.0   83     6  30 FALSE
##     Ozone Solar.R Wind Temp Month Day   Hot
aq[Temp > 90 & Month == 6, ]
##    Ozone Solar.R Wind Temp Month Day  Hot
## 1:    NA     259 10.9   93     6  11 TRUE
## 2:    NA     250  9.2   92     6  12 TRUE
aq[Temp %between% c(80, 90), ]
##     Ozone Solar.R Wind Temp Month Day   Hot
##  1:    45     252 14.9   81     5  29 FALSE
##  2:    NA     186  9.2   84     6   4 FALSE
##  3:    NA     220  8.6   85     6   5 FALSE
##  4:    29     127  9.7   82     6   7 FALSE
##  5:    NA     273  6.9   87     6   8 FALSE
##  6:    71     291 13.8   90     6   9 FALSE
##  7:    39     323 11.5   87     6  10 FALSE
##  8:    23     148  8.0   82     6  13 FALSE
##  9:    NA     332 13.8   80     6  14 FALSE
## 10:    NA      98 11.5   80     6  28 FALSE
## 11:    NA     138  8.0   83     6  30 FALSE
## 12:   135     269  4.1   84     7   1 FALSE
## 13:    49     248  9.2   85     7   2 FALSE
## 14:    32     236  9.2   81     7   3 FALSE
## 15:    NA     101 10.9   84     7   4 FALSE
## 16:    64     175  4.6   83     7   5 FALSE
## 17:    40     314 10.9   83     7   6 FALSE
## 18:    77     276  5.1   88     7   7 FALSE
## 19:    85     175  7.4   89     7  10 FALSE
## 20:    NA     139  8.6   82     7  11 FALSE
## 21:    27     175 14.9   81     7  13 FALSE
## 22:     7      48 14.3   80     7  15 FALSE
## 23:    48     260  6.9   81     7  16 FALSE
## 24:    35     274 10.3   82     7  17 FALSE
## 25:    61     285  6.3   84     7  18 FALSE
## 26:    79     187  5.1   87     7  19 FALSE
## 27:    63     220 11.5   85     7  20 FALSE
## 28:    NA     258  9.7   81     7  22 FALSE
## 29:    NA     295 11.5   82     7  23 FALSE
## 30:    80     294  8.6   86     7  24 FALSE
## 31:   108     223  8.0   85     7  25 FALSE
## 32:    20      81  8.6   82     7  26 FALSE
## 33:    52      82 12.0   86     7  27 FALSE
## 34:    82     213  7.4   88     7  28 FALSE
## 35:    50     275  7.4   86     7  29 FALSE
## 36:    64     253  7.4   83     7  30 FALSE
## 37:    59     254  9.2   81     7  31 FALSE
## 38:    39      83  6.9   81     8   1 FALSE
## 39:     9      24 13.8   81     8   2 FALSE
## 40:    16      77  7.4   82     8   3 FALSE
## 41:    78      NA  6.9   86     8   4 FALSE
## 42:    35      NA  7.4   85     8   5 FALSE
## 43:    66      NA  4.6   87     8   6 FALSE
## 44:   122     255  4.0   89     8   7 FALSE
## 45:    89     229 10.3   90     8   8 FALSE
## 46:   110     207  8.0   90     8   9 FALSE
## 47:    NA     137 11.5   86     8  11 FALSE
## 48:    44     192 11.5   86     8  12 FALSE
## 49:    28     273 11.5   82     8  13 FALSE
## 50:    65     157  9.7   80     8  14 FALSE
## 51:   168     238  3.4   81     8  25 FALSE
## 52:    73     215  8.0   86     8  26 FALSE
## 53:    NA     153  5.7   88     8  27 FALSE
## 54:    47      95  7.4   87     9   5 FALSE
## 55:    32      92 15.5   84     9   6 FALSE
## 56:    20     252 10.9   80     9   7 FALSE
## 57:    44     236 14.9   81     9  11 FALSE
## 58:    16     201  8.0   82     9  20 FALSE
## 59:    36     139 10.3   81     9  23 FALSE
##     Ozone Solar.R Wind Temp Month Day   Hot
aq[frankv(-Temp,
  ties.method = "min"
) <= 5, ]
##    Ozone Solar.R Wind Temp Month Day  Hot
## 1:    NA     259 10.9   93     6  11 TRUE
## 2:    76     203  9.7   97     8  28 TRUE
## 3:   118     225  2.3   94     8  29 TRUE
## 4:    84     237  6.3   96     8  30 TRUE
## 5:    85     188  6.3   94     8  31 TRUE
## 6:    73     183  2.8   93     9   3 TRUE
## 7:    91     189  4.6   93     9   4 TRUE
unique(aq) # removing duplicate rows
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    NA      NA 14.3   56     5   5 FALSE
##  ---                                        
## 149:    30     193  6.9   70     9  26 FALSE
## 150:    NA     145 13.2   77     9  27 FALSE
## 151:    14     191 14.3   75     9  28 FALSE
## 152:    18     131  8.0   76     9  29 FALSE
## 153:    20     223 11.5   68     9  30 FALSE
na.omit(aq) # removing rows with missing data
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    23     299  8.6   65     5   7 FALSE
##  ---                                        
## 107:    14      20 16.6   63     9  25 FALSE
## 108:    30     193  6.9   70     9  26 FALSE
## 109:    14     191 14.3   75     9  28 FALSE
## 110:    18     131  8.0   76     9  29 FALSE
## 111:    20     223 11.5   68     9  30 FALSE
na.omit(aq, "Ozone") # removing rows with missing Ozone values
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    28      NA 14.9   66     5   6 FALSE
##  ---                                        
## 112:    14      20 16.6   63     9  25 FALSE
## 113:    30     193  6.9   70     9  26 FALSE
## 114:    14     191 14.3   75     9  28 FALSE
## 115:    18     131  8.0   76     9  29 FALSE
## 116:    20     223 11.5   68     9  30 FALSE
# dplyr
aq %>% filter(Temp > 90)
##     Ozone Solar.R Wind Temp Month Day  Hot
##  1:    NA     259 10.9   93     6  11 TRUE
##  2:    NA     250  9.2   92     6  12 TRUE
##  3:    97     267  6.3   92     7   8 TRUE
##  4:    97     272  5.7   92     7   9 TRUE
##  5:    NA     291 14.9   91     7  14 TRUE
##  6:    NA     222  8.6   92     8  10 TRUE
##  7:    76     203  9.7   97     8  28 TRUE
##  8:   118     225  2.3   94     8  29 TRUE
##  9:    84     237  6.3   96     8  30 TRUE
## 10:    85     188  6.3   94     8  31 TRUE
## 11:    96     167  6.9   91     9   1 TRUE
## 12:    78     197  5.1   92     9   2 TRUE
## 13:    73     183  2.8   93     9   3 TRUE
## 14:    91     189  4.6   93     9   4 TRUE
aq %>% filter(Month == 6)
##     Ozone Solar.R Wind Temp Month Day   Hot
##  1:    NA     286  8.6   78     6   1 FALSE
##  2:    NA     287  9.7   74     6   2 FALSE
##  3:    NA     242 16.1   67     6   3 FALSE
##  4:    NA     186  9.2   84     6   4 FALSE
##  5:    NA     220  8.6   85     6   5 FALSE
##  6:    NA     264 14.3   79     6   6 FALSE
##  7:    29     127  9.7   82     6   7 FALSE
##  8:    NA     273  6.9   87     6   8 FALSE
##  9:    71     291 13.8   90     6   9 FALSE
## 10:    39     323 11.5   87     6  10 FALSE
## 11:    NA     259 10.9   93     6  11  TRUE
## 12:    NA     250  9.2   92     6  12  TRUE
## 13:    23     148  8.0   82     6  13 FALSE
## 14:    NA     332 13.8   80     6  14 FALSE
## 15:    NA     322 11.5   79     6  15 FALSE
## 16:    21     191 14.9   77     6  16 FALSE
## 17:    37     284 20.7   72     6  17 FALSE
## 18:    20      37  9.2   65     6  18 FALSE
## 19:    12     120 11.5   73     6  19 FALSE
## 20:    13     137 10.3   76     6  20 FALSE
## 21:    NA     150  6.3   77     6  21 FALSE
## 22:    NA      59  1.7   76     6  22 FALSE
## 23:    NA      91  4.6   76     6  23 FALSE
## 24:    NA     250  6.3   76     6  24 FALSE
## 25:    NA     135  8.0   75     6  25 FALSE
## 26:    NA     127  8.0   78     6  26 FALSE
## 27:    NA      47 10.3   73     6  27 FALSE
## 28:    NA      98 11.5   80     6  28 FALSE
## 29:    NA      31 14.9   77     6  29 FALSE
## 30:    NA     138  8.0   83     6  30 FALSE
##     Ozone Solar.R Wind Temp Month Day   Hot
aq %>% filter(
  Temp > 90,
  Month == 6
)
##    Ozone Solar.R Wind Temp Month Day  Hot
## 1:    NA     259 10.9   93     6  11 TRUE
## 2:    NA     250  9.2   92     6  12 TRUE
aq %>% filter(dplyr::between(Temp, 70, 90))
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    36     118  8.0   72     5   2 FALSE
##   2:    12     149 12.6   74     5   3 FALSE
##   3:     7      NA  6.9   74     5  11 FALSE
##   4:    11     320 16.6   73     5  22 FALSE
##   5:    45     252 14.9   81     5  29 FALSE
##  ---                                        
## 103:    36     139 10.3   81     9  23 FALSE
## 104:    30     193  6.9   70     9  26 FALSE
## 105:    NA     145 13.2   77     9  27 FALSE
## 106:    14     191 14.3   75     9  28 FALSE
## 107:    18     131  8.0   76     9  29 FALSE
aq %>% top_n(5, Temp)
##    Ozone Solar.R Wind Temp Month Day  Hot
## 1:    NA     259 10.9   93     6  11 TRUE
## 2:    76     203  9.7   97     8  28 TRUE
## 3:   118     225  2.3   94     8  29 TRUE
## 4:    84     237  6.3   96     8  30 TRUE
## 5:    85     188  6.3   94     8  31 TRUE
## 6:    73     183  2.8   93     9   3 TRUE
## 7:    91     189  4.6   93     9   4 TRUE
aq %>% distinct # removing duplicate rows based on duplication on certain columns
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    NA      NA 14.3   56     5   5 FALSE
##  ---                                        
## 149:    30     193  6.9   70     9  26 FALSE
## 150:    NA     145 13.2   77     9  27 FALSE
## 151:    14     191 14.3   75     9  28 FALSE
## 152:    18     131  8.0   76     9  29 FALSE
## 153:    20     223 11.5   68     9  30 FALSE
# another example of distinct()
# contributions <- map_df(list.files("mayor_finance_reports",
#   full.names = TRUE
# ), rio::import) %>%
#   filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
#   distinct(Contributor, Address, .keep_all = TRUE)

aq %>% drop_na # removing rows with missing data
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    23     299  8.6   65     5   7 FALSE
##  ---                                        
## 107:    14      20 16.6   63     9  25 FALSE
## 108:    30     193  6.9   70     9  26 FALSE
## 109:    14     191 14.3   75     9  28 FALSE
## 110:    18     131  8.0   76     9  29 FALSE
## 111:    20     223 11.5   68     9  30 FALSE
aq %>% drop_na("Ozone") # removing rows with missing Ozone values
##      Ozone Solar.R Wind Temp Month Day   Hot
##   1:    41     190  7.4   67     5   1 FALSE
##   2:    36     118  8.0   72     5   2 FALSE
##   3:    12     149 12.6   74     5   3 FALSE
##   4:    18     313 11.5   62     5   4 FALSE
##   5:    28      NA 14.9   66     5   6 FALSE
##  ---                                        
## 112:    14      20 16.6   63     9  25 FALSE
## 113:    30     193  6.9   70     9  26 FALSE
## 114:    14     191 14.3   75     9  28 FALSE
## 115:    18     131  8.0   76     9  29 FALSE
## 116:    20     223 11.5   68     9  30 FALSE
# selecting rows at random
# data.table
aq[sample(.N, 5), ]
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    13     112 11.5   71     9  15 FALSE
## 2:    NA      59  1.7   76     6  22 FALSE
## 3:    46     237  6.9   78     9  16 FALSE
## 4:    20     223 11.5   68     9  30 FALSE
## 5:    21     259 15.5   76     9  12 FALSE
# dplyr
aq %>% sample_n(5)
##    Ozone Solar.R Wind Temp Month Day   Hot
## 1:    NA      59  1.7   76     6  22 FALSE
## 2:    89     229 10.3   90     8   8 FALSE
## 3:    18     224 13.8   67     9  17 FALSE
## 4:    34     307 12.0   66     5  17 FALSE
## 5:    41     190  7.4   67     5   1 FALSE
# using regular expressions to select rows
dogs <- data.table(
  Name = c(
    "Bianca", "Bella", "Mimmi", "Daisy",
    "Ernst", "Smulan"
  ),
  Breed = c(
    "Greyhound", "Greyhound", "Pug", "Poodle",
    "Bedlington Terrier", "Boxer"
  ),
  Desc = c(
    "Fast, playful", "Fast, easily worried",
    "Intense, small, loud",
    "Majestic, protective, playful",
    "Playful, relaxed",
    "Loving, cuddly, playful"
  )
)

# data.table
dogs[Name %like% "^B", ]
##      Name     Breed                 Desc
## 1: Bianca Greyhound        Fast, playful
## 2:  Bella Greyhound Fast, easily worried
# or:
dogs[grepl("^B", Name), ]
##      Name     Breed                 Desc
## 1: Bianca Greyhound        Fast, playful
## 2:  Bella Greyhound Fast, easily worried
dogs[Desc %like% "[pP]layful", ]
##      Name              Breed                          Desc
## 1: Bianca          Greyhound                 Fast, playful
## 2:  Daisy             Poodle Majestic, protective, playful
## 3:  Ernst Bedlington Terrier              Playful, relaxed
## 4: Smulan              Boxer       Loving, cuddly, playful
# dplyr
dogs %>% filter(grepl("B[a-z]", Name))
##      Name     Breed                 Desc
## 1: Bianca Greyhound        Fast, playful
## 2:  Bella Greyhound Fast, easily worried
dogs %>% filter(grepl("[pP]layful", Desc))
##      Name              Breed                          Desc
## 1: Bianca          Greyhound                 Fast, playful
## 2:  Daisy             Poodle Majestic, protective, playful
## 3:  Ernst Bedlington Terrier              Playful, relaxed
## 4: Smulan              Boxer       Loving, cuddly, playful
# selecting a single column
# data.table
# Return a vector:
aq$Temp
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# or
aq[, Temp]
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# Return a data.table:
aq[, "Temp"]
##      Temp
##   1:   67
##   2:   72
##   3:   74
##   4:   62
##   5:   56
##  ---     
## 149:   70
## 150:   77
## 151:   75
## 152:   76
## 153:   68
aq[, .(Temp, Month, Day)]
##      Temp Month Day
##   1:   67     5   1
##   2:   72     5   2
##   3:   74     5   3
##   4:   62     5   4
##   5:   56     5   5
##  ---               
## 149:   70     9  26
## 150:   77     9  27
## 151:   75     9  28
## 152:   76     9  29
## 153:   68     9  30
aq[, Wind:Month]
##      Wind Temp Month
##   1:  7.4   67     5
##   2:  8.0   72     5
##   3: 12.6   74     5
##   4: 11.5   62     5
##   5: 14.3   56     5
##  ---                
## 149:  6.9   70     9
## 150: 13.2   77     9
## 151: 14.3   75     9
## 152:  8.0   76     9
## 153: 11.5   68     9
aq[, -c("Month", "Day")]
##      Ozone Solar.R Wind Temp   Hot
##   1:    41     190  7.4   67 FALSE
##   2:    36     118  8.0   72 FALSE
##   3:    12     149 12.6   74 FALSE
##   4:    18     313 11.5   62 FALSE
##   5:    NA      NA 14.3   56 FALSE
##  ---                              
## 149:    30     193  6.9   70 FALSE
## 150:    NA     145 13.2   77 FALSE
## 151:    14     191 14.3   75 FALSE
## 152:    18     131  8.0   76 FALSE
## 153:    20     223 11.5   68 FALSE
aq[, sapply(msleep, class) == "numeric"] # selecting all numeric variables
##         name        genus         vore        order conservation  sleep_total 
##        FALSE        FALSE        FALSE        FALSE        FALSE         TRUE 
##    sleep_rem  sleep_cycle        awake      brainwt       bodywt 
##         TRUE         TRUE         TRUE         TRUE         TRUE
aq[, .SD, .SDcols = colSums(is.na(aq)) == 0] # removing cols with missing values
##      Wind Temp Month Day   Hot
##   1:  7.4   67     5   1 FALSE
##   2:  8.0   72     5   2 FALSE
##   3: 12.6   74     5   3 FALSE
##   4: 11.5   62     5   4 FALSE
##   5: 14.3   56     5   5 FALSE
##  ---                          
## 149:  6.9   70     9  26 FALSE
## 150: 13.2   77     9  27 FALSE
## 151: 14.3   75     9  28 FALSE
## 152:  8.0   76     9  29 FALSE
## 153: 11.5   68     9  30 FALSE
# dplyr
# Return a vector:
aq$Temp
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# or
aq %>% pull(Temp)
##   [1] 67 72 74 62 56 66 65 59 61 69 74 69 66 68 58 64 66 57 68 62 59 73 61 61 57
##  [26] 58 57 67 81 79 76 78 74 67 84 85 79 82 87 90 87 93 92 82 80 79 77 72 65 73
##  [51] 76 77 76 76 76 75 78 73 80 77 83 84 85 81 84 83 83 88 92 92 89 82 73 81 91
##  [76] 80 81 82 84 87 85 74 81 82 86 85 82 86 88 86 83 81 81 81 82 86 85 87 89 90
## [101] 90 92 86 86 82 80 79 77 79 76 78 78 77 72 75 79 81 86 88 97 94 96 94 91 92
## [126] 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76 68 82 64 71 81 69 63 70 77
## [151] 75 76 68
# Return a tibble:
aq %>% select(Temp)
##      Temp
##   1:   67
##   2:   72
##   3:   74
##   4:   62
##   5:   56
##  ---     
## 149:   70
## 150:   77
## 151:   75
## 152:   76
## 153:   68
aq %>% select(Temp, Month, Day)
##      Temp Month Day
##   1:   67     5   1
##   2:   72     5   2
##   3:   74     5   3
##   4:   62     5   4
##   5:   56     5   5
##  ---               
## 149:   70     9  26
## 150:   77     9  27
## 151:   75     9  28
## 152:   76     9  29
## 153:   68     9  30
aq %>% select(Wind:Month)
##      Wind Temp Month
##   1:  7.4   67     5
##   2:  8.0   72     5
##   3: 12.6   74     5
##   4: 11.5   62     5
##   5: 14.3   56     5
##  ---                
## 149:  6.9   70     9
## 150: 13.2   77     9
## 151: 14.3   75     9
## 152:  8.0   76     9
## 153: 11.5   68     9
aq %>% select(-Month, -Day)
##      Ozone Solar.R Wind Temp   Hot
##   1:    41     190  7.4   67 FALSE
##   2:    36     118  8.0   72 FALSE
##   3:    12     149 12.6   74 FALSE
##   4:    18     313 11.5   62 FALSE
##   5:    NA      NA 14.3   56 FALSE
##  ---                              
## 149:    30     193  6.9   70 FALSE
## 150:    NA     145 13.2   77 FALSE
## 151:    14     191 14.3   75 FALSE
## 152:    18     131  8.0   76 FALSE
## 153:    20     223 11.5   68 FALSE
aq %>% select_if(is.numeric)
##      Ozone Solar.R Wind Temp Month Day
##   1:    41     190  7.4   67     5   1
##   2:    36     118  8.0   72     5   2
##   3:    12     149 12.6   74     5   3
##   4:    18     313 11.5   62     5   4
##   5:    NA      NA 14.3   56     5   5
##  ---                                  
## 149:    30     193  6.9   70     9  26
## 150:    NA     145 13.2   77     9  27
## 151:    14     191 14.3   75     9  28
## 152:    18     131  8.0   76     9  29
## 153:    20     223 11.5   68     9  30
aq %>% select_if(~all(!is.na(.)))
##      Wind Temp Month Day   Hot
##   1:  7.4   67     5   1 FALSE
##   2:  8.0   72     5   2 FALSE
##   3: 12.6   74     5   3 FALSE
##   4: 11.5   62     5   4 FALSE
##   5: 14.3   56     5   5 FALSE
##  ---                          
## 149:  6.9   70     9  26 FALSE
## 150: 13.2   77     9  27 FALSE
## 151: 14.3   75     9  28 FALSE
## 152:  8.0   76     9  29 FALSE
## 153: 11.5   68     9  30 FALSE
# using regular expressions to select columns
# data.table
vars <- grepl("n", names(aq))
aq[, ..vars]
##      Ozone Wind Month
##   1:    41  7.4     5
##   2:    36  8.0     5
##   3:    12 12.6     5
##   4:    18 11.5     5
##   5:    NA 14.3     5
##  ---                 
## 149:    30  6.9     9
## 150:    NA 13.2     9
## 151:    14 14.3     9
## 152:    18  8.0     9
## 153:    20 11.5     9
# dplyr
# contains is a convenience
# function for checking if a name
# contains a string:
aq %>% select(contains("n"))
##      Ozone Wind Month
##   1:    41  7.4     5
##   2:    36  8.0     5
##   3:    12 12.6     5
##   4:    18 11.5     5
##   5:    NA 14.3     5
##  ---                 
## 149:    30  6.9     9
## 150:    NA 13.2     9
## 151:    14 14.3     9
## 152:    18  8.0     9
## 153:    20 11.5     9
# matches can be used with any
# regular expression:
aq %>% select(matches("n"))
##      Ozone Wind Month
##   1:    41  7.4     5
##   2:    36  8.0     5
##   3:    12 12.6     5
##   4:    18 11.5     5
##   5:    NA 14.3     5
##  ---                 
## 149:    30  6.9     9
## 150:    NA 13.2     9
## 151:    14 14.3     9
## 152:    18  8.0     9
## 153:    20 11.5     9
# subsetting using columns numbers
# using column numbers can yield different results depending on what type of data table you"re using
aq <- as.data.frame(airquality)
str(aq[,2])
##  int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# data.table:
aq <- as.data.table(airquality)
str(aq[,2])
## Classes 'data.table' and 'data.frame':   153 obs. of  1 variable:
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# tibble:
aq <- as_tibble(airquality)
str(aq[,2])
## tibble [153 × 1] (S3: tbl_df/tbl/data.frame)
##  $ Solar.R: int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# aq[[2]]  works the same for data frames, data tables and tibbles, returning a vector
aq <- as.data.frame(airquality)
str(aq[[2]])
##  int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# data.table:
aq <- as.data.table(airquality)
str(aq[[2]])
##  int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# tibble:
aq <- as_tibble(airquality)
str(aq[[2]])
##  int [1:153] 190 118 149 313 NA NA 299 99 19 194 ...
# changing the column order
# data.table
setcolorder(aq, c("Month", "Day"))
# dplyr
aq %>% relocate("Month", "Day")
## # A tibble: 153 × 7
##    Month   Day Ozone Solar.R  Wind  Temp Hot  
##    <int> <int> <int>   <int> <dbl> <int> <lgl>
##  1     5     1    41     190   7.4    67 FALSE
##  2     5     2    36     118   8      72 FALSE
##  3     5     3    12     149  12.6    74 FALSE
##  4     5     4    18     313  11.5    62 FALSE
##  5     5     5    NA      NA  14.3    56 FALSE
##  6     5     6    28      NA  14.9    66 FALSE
##  7     5     7    23     299   8.6    65 FALSE
##  8     5     8    19      99  13.8    59 FALSE
##  9     5     9     8      19  20.1    61 FALSE
## 10     5    10    NA     194   8.6    69 FALSE
## # … with 143 more rows
# changing row order
# sorting a single vector
aq <- data.table(airquality)
sort(aq$Wind)
##   [1] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6
##  [38] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7
##  [75] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [112] 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
## [149] 9 9 9 9 9
sort(aq$Wind, decreasing = TRUE)
##   [1] 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8
##  [38] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 7 7 7
##  [75] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [112] 6 6 6 6 6 6 6 6 6 6 6 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [149] 5 5 5 5 5
sort(c("C", "B", "A", "D"))
## [1] "A" "B" "C" "D"
# data.table
aq[order(Wind), ] # ascending order
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    41 190   7.4      67    5    1 FALSE
##   2:    36 118   8.0      72    5    2 FALSE
##   3:    12 149  12.6      74    5    3 FALSE
##   4:    18 313  11.5      62    5    4 FALSE
##   5:    NA  NA  14.3      56    5    5 FALSE
##  ---                                        
## 149:    30 193   6.9      70    9   26 FALSE
## 150:    NA 145  13.2      77    9   27 FALSE
## 151:    14 191  14.3      75    9   28 FALSE
## 152:    18 131   8.0      76    9   29 FALSE
## 153:    20 223  11.5      68    9   30 FALSE
aq[order(-Wind), ] # descending order
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    96 167   6.9      91    9    1  TRUE
##   2:    78 197   5.1      92    9    2  TRUE
##   3:    73 183   2.8      93    9    3  TRUE
##   4:    91 189   4.6      93    9    4  TRUE
##   5:    47  95   7.4      87    9    5 FALSE
##  ---                                        
## 149:    NA  NA   8.0      57    5   27 FALSE
## 150:    23  13  12.0      67    5   28 FALSE
## 151:    45 252  14.9      81    5   29 FALSE
## 152:   115 223   5.7      79    5   30 FALSE
## 153:    37 279   7.4      76    5   31 FALSE
aq[order(Temp, -Wind), ]
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    96 167   6.9      91    9    1  TRUE
##   2:    39  83   6.9      81    8    1 FALSE
##   3:   135 269   4.1      84    7    1 FALSE
##   4:    NA 286   8.6      78    6    1 FALSE
##   5:    41 190   7.4      67    5    1 FALSE
##  ---                                        
## 149:    NA 138   8.0      83    6   30 FALSE
## 150:   115 223   5.7      79    5   30 FALSE
## 151:    85 188   6.3      94    8   31  TRUE
## 152:    59 254   9.2      81    7   31 FALSE
## 153:    37 279   7.4      76    5   31 FALSE
# dplyr
aq %>% arrange(Wind) # ascending order
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    41 190   7.4      67    5    1 FALSE
##   2:    36 118   8.0      72    5    2 FALSE
##   3:    12 149  12.6      74    5    3 FALSE
##   4:    18 313  11.5      62    5    4 FALSE
##   5:    NA  NA  14.3      56    5    5 FALSE
##  ---                                        
## 149:    30 193   6.9      70    9   26 FALSE
## 150:    NA 145  13.2      77    9   27 FALSE
## 151:    14 191  14.3      75    9   28 FALSE
## 152:    18 131   8.0      76    9   29 FALSE
## 153:    20 223  11.5      68    9   30 FALSE
aq %>% arrange(-Wind) # descending order
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    96 167   6.9      91    9    1  TRUE
##   2:    78 197   5.1      92    9    2  TRUE
##   3:    73 183   2.8      93    9    3  TRUE
##   4:    91 189   4.6      93    9    4  TRUE
##   5:    47  95   7.4      87    9    5 FALSE
##  ---                                        
## 149:    NA  NA   8.0      57    5   27 FALSE
## 150:    23  13  12.0      67    5   28 FALSE
## 151:    45 252  14.9      81    5   29 FALSE
## 152:   115 223   5.7      79    5   30 FALSE
## 153:    37 279   7.4      76    5   31 FALSE
# or
aq %>% arrange(desc(Wind))
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    96 167   6.9      91    9    1  TRUE
##   2:    78 197   5.1      92    9    2  TRUE
##   3:    73 183   2.8      93    9    3  TRUE
##   4:    91 189   4.6      93    9    4  TRUE
##   5:    47  95   7.4      87    9    5 FALSE
##  ---                                        
## 149:    NA  NA   8.0      57    5   27 FALSE
## 150:    23  13  12.0      67    5   28 FALSE
## 151:    45 252  14.9      81    5   29 FALSE
## 152:   115 223   5.7      79    5   30 FALSE
## 153:    37 279   7.4      76    5   31 FALSE
aq %>% arrange(Temp, desc(Wind))
##      Month Day Ozone Solar.R Wind Temp   Hot
##   1:    96 167   6.9      91    9    1  TRUE
##   2:    39  83   6.9      81    8    1 FALSE
##   3:   135 269   4.1      84    7    1 FALSE
##   4:    NA 286   8.6      78    6    1 FALSE
##   5:    41 190   7.4      67    5    1 FALSE
##  ---                                        
## 149:    NA 138   8.0      83    6   30 FALSE
## 150:   115 223   5.7      79    5   30 FALSE
## 151:    85 188   6.3      94    8   31  TRUE
## 152:    59 254   9.2      81    7   31 FALSE
## 153:    37 279   7.4      76    5   31 FALSE
# filling in missing values
aq$Month[c(2:3, 36:39, 70)] <- NA
# data.table
aq[, Month := data.table::nafill(Month, "locf")] # fill the missing values with the last non-missing entry
aq[, Month := nafill(Month, "nocb")] # fill the missing values with the next non-missing entry
# dplyr
aq %>% fill(Month) -> aq 
aq %>% fill(Month, .direction = "up") -> aq

7 String manipulation

7.1 Base-R

Character values in R can be stored as scalars, vectors, or matrices, or they can be columns of a data frame or elements of a list. When applied to objects like this, the length function will report the number of character values in the object, not the number of characters in each string. To find the number of characters in a character value, the nchar function can be used. Like most functions in R, nchar is vectorized.

Like other objects in R, character values will be displayed when their name is typed at the console or when they are passed to the print function. However, it is often more convenient to print or display these objects directly without the subscripts that the print function provides. The cat function will combine character values and print them to the screen or a file directly. The cat function coerces its arguments to character values, then concatenates and displays them. This makes the function ideal for printing messages and warnings from inside of functions.

cat will always print a newline when it encounters a newline character. When there are multiple strings passed to cat, or when the argument to cat is a vector of character strings, the fill= argument can be used to automatically insert newlines into the output string. If fill= is set to TRUE, the value of the system width option will be used to determine the linesize; if a numeric value is used, the output will be displayed using that width, although cat will not insert newlines into individual elements of its input.

The cat function also accepts a file= argument to specify that its output should be directed to a file. When the file= argument is used, the append=TRUE argument can also be provided to have cat append its output to an already existing file. For more control over the way that character values are concatenated, the paste function can be used. In its simplest usage, this function will accept an unlimited number of scalars, and join them together, separating each scalar with a space by default. To use a character string other than a space as a separator, the sep= argument can be used. If any object passed to paste is not of mode character, it is converted to character. If a character vector is passed to paste, the collapse= argument can be used to specify a character string to place between each element of the vector.

When multiple arguments are passed to paste, it will vectorize the operation, recycling shorter elements when necessary. This makes it easy to generate variable names with a common prefix.

Individual characters of character values are not accessible through ordinary subscripting. Instead, the substring function can be used either to extract parts of character strings, or to change the values of parts of character strings. In addition to the string being operated on, substring accepts a first= argument giving the first character of the desired substring, and a last= argument giving the last character. If not specified, last= defaults to a large number, so that specifying just a start= value will operate from that character to the end of the string. Like most functions in R, substring is vectorized, operating on multiple strings at once. In the case of strings that have fewer characters than specified in the last= argument, substring returns as many characters as it finds with no padding provided.

For finding locations of particular characters within a character string, the string first needs to be converted to a character vector containing individual characters. This can be done by passing a vector consisting of all the characters to be processed as both the first= and last= arguments, and then applying which to the result.

Regular expressions are supported in the R functions strsplit, grep, sub, and gsub, as well as in the regexpr and gregexpr functions which are the main tools for working with regular expressions in R. Regular expression syntax varies depending on the particular implementation a program uses. R tries to provide a great deal of flexibility regarding the regular expressions it understands. By default, R uses a basic set of regular expressions similar to those used by UNIX utilities like grep. The extended=TRUE argument to R functions that support regular expressions extend the set of regular expressions to include those supported by the POSIX 1003.2 standard. To use regular expressions like those supported by scripting languages such as perl and python, the perl=TRUE argument can be used.

The backslash character (\) is used in regular expressions to signal that certain characters with special meaning in regular expressions should be treated as normal characters. In R, this means that two backslash characters need to be entered into an input string anywhere that special characters need to be escaped. Although the double backslash will display when the string is printed, nchar or cat can verify that only a single backslash is actually included in the string.

Single backslashes, like those which are part of a newline character (\n), will be interpreted correctly inside of regular expressions.

Modifers for regular expressions The strsplit function can use a character string or regular expression to divide up a character string into smaller pieces. The first argument to strsplit is the character string to break up, and the second argument is the character value or regular expression which should be used to break up the string into parts.Like other functions that can return different numbers of elements from their inputs, strsplit returns its results as a list, even when its input is a single character string. Because strsplit can accept regular expressions to decide where to split a character string, a wide variety of situations can be easily handled.

The grep function accepts a regular expression and a character string or vector of character strings, and returns the indices of those elements of the strings which are matched by the regular expression. If the value=TRUE argument is passed to grep, it will return the actual strings which matched the expression instead of the indices. If the string to be matched should be interpreted literally (i.e., not as a regular expression), the fixed=TRUE argument should be used. One important use of grep is to extract a set of variables from a data frame based on their names. To create a data frame with just these variables, we can use the output of grep as a subscript. To find regular expressions without regard to the case (upper or lower) of the input, the ignore.case=TRUE argument can be used.

Surrounding a string with escaped angle brackets (\\< and \\>) restricts matches to the case where the string is surrounded by either white space, punctuation, or a line ending or beginning. If the regular expression passed to grep is not matched in any of its inputs, grep returns an empty numeric vector. Thus, the any function can be used to test if a regular expression occurs anywhere in a vector of strings.

While the grep function can be used to test for the presence of a regular expression, sometimes more details regarding the matches that are found are needed. In R, the regexpr and gregexpr functions can be used to pinpoint and possibly extract those parts of a string that were matched by a regular expression. The output from these functions is a vector of starting positions of the regular expressions which were found; if no match occurred, a value of -1 is returned. In addition, an attribute called match.length is associated with the vector of starting positions to provide information about exactly which characters were involved in the match. The regexpr function will only provide information about the first match in its input string(s), while the gregexpr function returns information about all matches found. The input arguments to regexpr and gregexpr are similar to those of grep; however, the ignore.case=TRUE argument is not available in versions of R earlier than version 2.6.

Since regexpr only reports the first match it finds, it will always return a vector, with -1 in those positions where no match was found. To extract the strings that actually matched, substr can be used, after calculating the ending position from the regexpr output and the match.length attribute.

For substituting text based on regular expressions, R provides two functions: sub and gsub. Each of these functions accepts a regular expression, a string containing what will be substituted for the regular expression, and the string or strings to operate on. The sub function changes only the first occurrence of the regular expression, while the gsub function performs the substitution on all occurrences within the string. One important use of these functions concerns numeric data which is read from text sources like web pages or financial reports, and which may contain commas or dollar signs.

When using the substitution functions, a powerful feature known as tagging of regular expressions is available. When part of a regular expression is surrounded by (unescaped) parentheses, that part can be used in a substitution pattern by representing it as a backslash followed by a number. The first tagged pattern is represented by \\1, the second by \\2, and so on. To extract just the tagged pattern from a regular expression, one possibility is to use the regular expression beginning and end anchor characters (^ and $, respectively) to account for all the nontagged characters in the string, and specify just the tagged expression for the substitution string.

If you wish to create a number of similar strings based on information from other variables, you can use sprintf, which allows you to write a string using %s as a placeholder for the values that should be pulled from other variables.

# finding the lengths of the names of the states of USA
nchar(state.name)
##  [1]  7  6  7  8 10  8 11  8  7  7  6  5  8  7  4  6  8  9  5  8 13  8  9 11  8
## [26]  7  8  6 13 10 10  8 14 12  4  8  6 12 12 14 12  9  5  4  7  8 10 13  9  7
# using cat function
x = 7
y = 10
cat("x should be greater than y, but x=',x,' and y=',y,' x should be greater than y, but x = 7 and y = 10")
## x should be greater than y, but x=',x,' and y=',y,' x should be greater than y, but x = 7 and y = 10
# with fill argument
cat("Long strings can", "be displayed over", "several lines using", "the fill= argument", fill = 40)
## Long strings can be displayed over 
## several lines using the fill= argument
# using paste
paste("one", 2, "three", 4, "five")
## [1] "one 2 three 4 five"
paste(c("one", "two", "three", "four"), collapse = " ")
## [1] "one two three four"
paste(c("X", "Y"), 1:5, sep = "")
## [1] "X1" "Y2" "X3" "Y4" "X5"
paste(c("X", "Y"), 1:5, sep = "_", collapse = "|")
## [1] "X_1|Y_2|X_3|Y_4|X_5"
paste(c("X", "Y"), 1:5, sep = "_") # with space -> no collapse argument
## [1] "X_1" "Y_2" "X_3" "Y_4" "X_5"
# working with parts of character values
substring(state.name, 2, 6)
##  [1] "labam" "laska" "rizon" "rkans" "alifo" "olora" "onnec" "elawa" "lorid"
## [10] "eorgi" "awaii" "daho"  "llino" "ndian" "owa"   "ansas" "entuc" "ouisi"
## [19] "aine"  "aryla" "assac" "ichig" "innes" "issis" "issou" "ontan" "ebras"
## [28] "evada" "ew Ha" "ew Je" "ew Me" "ew Yo" "orth " "orth " "hio"   "klaho"
## [37] "regon" "ennsy" "hode " "outh " "outh " "ennes" "exas"  "tah"   "ermon"
## [46] "irgin" "ashin" "est V" "iscon" "yomin"
# finding location of particular characters
state <- "Mississippi"
ll <- nchar(state)
ltrs <- substring(state, 1:ll, 1:ll)
ltrs
##  [1] "M" "i" "s" "s" "i" "s" "s" "i" "p" "p" "i"
which(ltrs == "s")
## [1] 3 4 6 7
# regular expressions
expr <- ".*\\.txt"
nchar(expr)
## [1] 7
cat(expr, "\n")
## .*\.txt
# breaking apart character values
sentence <- "R is a free software environment for statistical computing"
parts <- strsplit(sentence, " ")
parts
## [[1]]
## [1] "R"           "is"          "a"           "free"        "software"   
## [6] "environment" "for"         "statistical" "computing"
length(parts)
## [1] 1
length(parts[[1]])
## [1] 9
sapply(parts, length)
## [1] 9
allparts <- unlist(parts)
allparts
## [1] "R"           "is"          "a"           "free"        "software"   
## [6] "environment" "for"         "statistical" "computing"
str <- "one two three four"
strsplit(str, " +")
## [[1]]
## [1] "one"   "two"   "three" "four"
# using regex
grep("^pop", names(LifeCycleSavings)) # indices
## [1] 2 3
grep("^pop", names(LifeCycleSavings), value = TRUE) # values
## [1] "pop15" "pop75"
# creating a data frame using grep as a subscript
head(LifeCycleSavings[, grep("^pop", names(LifeCycleSavings))])
##           pop15 pop75
## Australia 29.35  2.87
## Austria   23.32  4.41
## Belgium   23.80  4.43
## Bolivia   41.89  1.67
## Brazil    42.19  0.83
## Canada    31.72  2.85
# ignoring upper or lower case
inp <- c("run dog run", "work doggedly", "CAT AND DOG")
grep("\\<dog\\>", inp, ignore.case = TRUE)
## [1] 1 3
# checking if a regex occurs anywhere in text
str1 <- c("The R Foundation", "is a not for profit organization", "working in the public interest")
str2 <- c(" It was founded by the members", "of the R Core Team in order", "to provide support for the R project")
any(grep("profit", str1))
## [1] TRUE
any(grep("profit", str2))
## [1] FALSE
# using regexpr
tst <- c("one x7 two b1", "three c5 four b9", "five six seven", "a8 eight nine")
wh <- regexpr("[a-z][0-9]", tst)
wh
## [1]  5  7 -1  1
## attr(,"match.length")
## [1]  2  2 -1  2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
res <- substring(tst, wh, wh + attr(wh, "match.length") - 1)
res
## [1] "x7" "c5" ""   "a8"
# using gregexpr
wh1 <- gregexpr("[a-z][0-9]", tst)
wh1
## [[1]]
## [1]  5 12
## attr(,"match.length")
## [1] 2 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1]  7 15
## attr(,"match.length")
## [1] 2 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1] 1
## attr(,"match.length")
## [1] 2
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
res1 = list()
for (i in 1:length(wh1)) {
  res1[[i]] <- substring(
    tst[i], wh1[[i]],
    wh1[[i]] +
      attr(wh1[[i]], "match.length") - 1
  )
}
res1
## [[1]]
## [1] "x7" "b1"
## 
## [[2]]
## [1] "c5" "b9"
## 
## [[3]]
## [1] ""
## 
## [[4]]
## [1] "a8"
# substitutions
values <- c("$11,317.35", "$11,234.51", "$11,275.89", "$11,278.93", "$11,294.94")
as.numeric(gsub("[$,]", "", values))
## [1] 11317.35 11234.51 11275.89 11278.93 11294.94
# tagging
values <- c("75.99", "(20.30)", "55.20")
as.numeric(gsub("\\(([0-9.]+)\\)", "-\\1", values))
## [1]  75.99 -20.30  55.20
# extracting the tagged pattern
str <- "report: 17 value=12 time=2:00"
sub("value=([^ ]+)", "\\1", str)
## [1] "report: 17 12 time=2:00"
sub("^.*value=([^ ]+).*$", "\\1", str)
## [1] "12"
# using variables into strings
names <- c("Irma", "Bea", "Lisa")
ages <- c(5, 59, 36)
sprintf("%s is %s years old.", names, ages)
## [1] "Irma is 5 years old."  "Bea is 59 years old."  "Lisa is 36 years old."

7.2 Tidyverse

Para trabajar con la manipulación de cadenas de texto usamos el paquete stringr, cuyas funciones siempre empiezan por str_*, seguidas por un verbo y el primer argumento, que siempre es un vector de caracteres. La mayoría de las funciones str_* usan expresiones regulares.

# data
geo <- pull(emisiones, 1)
# how many characters has every element
head( str_length(geo), 10)
##  [1] 36 73  7  8 14  7 48  7  7  6
# to lower
head (str_to_lower(geo), 10)
##  [1] "european union (current composition)"                                     
##  [2] "european union (current composition) and iceland under the kyoto protocol"
##  [3] "belgium"                                                                  
##  [4] "bulgaria"                                                                 
##  [5] "czech republic"                                                           
##  [6] "denmark"                                                                  
##  [7] "germany (until 1990 former territory of the frg)"                         
##  [8] "estonia"                                                                  
##  [9] "ireland"                                                                  
## [10] "greece"
# joining characters
str_c (geo[26 : 35], collapse = ", ")
## [1] "Slovenia, Slovakia, Finland, Sweden, United Kingdom, Iceland, Liechtenstein, Norway, Switzerland, Turkey"
str_c (geo[ 26 : 35], 1:10, sep = "_" )
##  [1] "Slovenia_1"       "Slovakia_2"       "Finland_3"        "Sweden_4"        
##  [5] "United Kingdom_5" "Iceland_6"        "Liechtenstein_7"  "Norway_8"        
##  [9] "Switzerland_9"    "Turkey_10"
# extracting characters between two indices
str_sub(geo[ 26 : 35], 1, 3)
##  [1] "Slo" "Slo" "Fin" "Swe" "Uni" "Ice" "Lie" "Nor" "Swi" "Tur"
# str_ and regex
str_subset(geo, "[p]")
## [1] "European Union (current composition)"                                     
## [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
## [3] "Czech Republic"                                                           
## [4] "Spain"                                                                    
## [5] "Cyprus"
str_subset(geo, "[pl]")
##  [1] "European Union (current composition)"                                     
##  [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
##  [3] "Belgium"                                                                  
##  [4] "Bulgaria"                                                                 
##  [5] "Czech Republic"                                                           
##  [6] "Germany (until 1990 former territory of the FRG)"                         
##  [7] "Ireland"                                                                  
##  [8] "Spain"                                                                    
##  [9] "Italy"                                                                    
## [10] "Cyprus"                                                                   
## [11] "Malta"                                                                    
## [12] "Netherlands"                                                              
## [13] "Poland"                                                                   
## [14] "Portugal"                                                                 
## [15] "Slovenia"                                                                 
## [16] "Slovakia"                                                                 
## [17] "Finland"                                                                  
## [18] "Iceland"                                                                  
## [19] "Switzerland"
str_subset(geo, "^E")
## [1] "European Union (current composition)"                                     
## [2] "European Union (current composition) and Iceland under the Kyoto Protocol"
## [3] "Estonia"
str_subset(geo, "E$")
## character(0)
# counting
str_count(geo, "[ou]")
##  [1]  7 13  1  1  1  0  4  1  0  0  0  0  1  0  1  0  1  3  1  0  0  1  1  2  1
## [26]  1  1  0  0  1  0  0  1  0  1
# detecting a pattern
str_detect(geo, "^L")
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
# extracting a pattern
str_extract(geo, "Euro")
##  [1] "Euro" "Euro" NA     NA     NA     NA     NA     NA     NA     NA    
## [11] NA     NA     NA     NA     NA     NA     NA     NA     NA     NA    
## [21] NA     NA     NA     NA     NA     NA     NA     NA     NA     NA    
## [31] NA     NA     NA     NA     NA
str_extract(geo, "[euro]")
##  [1] "u" "u" "e" "u" "e" "e" "e" "o" "r" "r" NA  "r" "r" NA  "r" NA  "u" "u" "u"
## [20] NA  "e" "u" "o" "o" "o" "o" "o" NA  "e" "e" "e" "e" "o" "e" "u"
# replacing characters
str_replace(geo, "\\(current composition\\)" , "")
##  [1] "European Union "                                     
##  [2] "European Union  and Iceland under the Kyoto Protocol"
##  [3] "Belgium"                                             
##  [4] "Bulgaria"                                            
##  [5] "Czech Republic"                                      
##  [6] "Denmark"                                             
##  [7] "Germany (until 1990 former territory of the FRG)"    
##  [8] "Estonia"                                             
##  [9] "Ireland"                                             
## [10] "Greece"                                              
## [11] "Spain"                                               
## [12] "France"                                              
## [13] "Croatia"                                             
## [14] "Italy"                                               
## [15] "Cyprus"                                              
## [16] "Latvia"                                              
## [17] "Lithuania"                                           
## [18] "Luxembourg"                                          
## [19] "Hungary"                                             
## [20] "Malta"                                               
## [21] "Netherlands"                                         
## [22] "Austria"                                             
## [23] "Poland"                                              
## [24] "Portugal"                                            
## [25] "Romania"                                             
## [26] "Slovenia"                                            
## [27] "Slovakia"                                            
## [28] "Finland"                                             
## [29] "Sweden"                                              
## [30] "United Kingdom"                                      
## [31] "Iceland"                                             
## [32] "Liechtenstein"                                       
## [33] "Norway"                                              
## [34] "Switzerland"                                         
## [35] "Turkey"
geo2 <- str_c(geo[26 : 35], 1 :10, sep="_")
geo2
##  [1] "Slovenia_1"       "Slovakia_2"       "Finland_3"        "Sweden_4"        
##  [5] "United Kingdom_5" "Iceland_6"        "Liechtenstein_7"  "Norway_8"        
##  [9] "Switzerland_9"    "Turkey_10"
str_replace(geo2, "_[0-9]{1,2}", "")
##  [1] "Slovenia"       "Slovakia"       "Finland"        "Sweden"        
##  [5] "United Kingdom" "Iceland"        "Liechtenstein"  "Norway"        
##  [9] "Switzerland"    "Turkey"
# creating the format "leading zero"
str_pad(1:12, 2, "left", "0")
##  [1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"

8 Factors and Levels

8.1 Base-R

Conceptually, factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables. One of the most important uses of factors is in statistical modeling; since categorical variables enter into statistical models differently than continuous variables, storing data as factors insures that the modeling functions will treat such data correctly.

Factors in R are stored as a vector of integer values with a corresponding set of character values to use when the factor is displayed. The factor function is used to create a factor. The only required argument to factor is a vector of values which will be returned as a vector of factor values. Both numeric and character variables can be made into factors, but a factor’s levels will always be character values. You can see the possible levels for a factor by calling the levels function; the nlevels function will return the number of levels of a factor.

To change the order in which the levels will be displayed from their default sorted order, the levels= argument can be given a vector of all the possible values of the variable in the order you desire. If the ordering should also be used when performing comparisons, use the optional ordered=TRUE argument.In this case, the factor is known as an ordered factor.

The levels of a factor are used when displaying the factor’s values. You can change these levels at the time you create a factor by passing a vector with the new values through the labels= argument. Note that this actually changes the internal levels of the factor, and to change the labels of a factor after it has been created, the assignment form of the levels function is used.

Factors represent a very efficient way to store character values, because each unique character value is stored only once, and the data itself is stored as a vector of integers. Because of this, read.table will automatically convert character variables to factors unless the as.is=TRUE or stringsAsFactors=FALSE arguments are specified, or the stringsAsFactors system option is set to FALSE. Comparison operators are not supported for unordered factors. The order in which the levels are displayed is determined by the order in which they appear in the levels= argument to factor. Sometimes, a factor needs to be reordered on the basis of some property of that factor. The reorder function takes three arguments: a factor, a vector of values on which the reordering is based, and a function to operate on those values for each factor level. When reorder is used, it assigns an attribute called scores which contains the value used for the reordering.

For some statistical procedures, the interpretation of results can be simplified by forcing a particular order to a factor; in particular, it may be useful to choose a “reference” level, which should be the first level of the factor. The relevel function allows you to choose a reference level, which will then be treated as the first level of the factor.

While it may be necessary to convert a numeric variable to a factor for a particular application, it is often very useful to convert the factor back to its original numeric values, since even simple arithmetic operations will fail when using factors. Since the as.numeric function will simply return the internal integer values of the factor, the conversion must be done using the levels attribute of the factor, or by first converting the factor to a character value using as.character.

When a factor is first created, all of its levels are stored along with the factor, and if subsets of the factor are extracted, they will retain all of the original levels. This can create problems when constructing model matrices and may or may not be useful when displaying the data using, say, the table function. To change this, we can use the drop=TRUE argument to the subscripting operator. When used with factors, this argument will remove the unused levels.

To exclude certain levels from appearing in a factor, the exclude= argument can be passed to factor. By default, the missing value (NA) is excluded from factor levels; to create a factor that includes missing values from a numeric variable, use exclude=NULL.

Care must be taken when combining variables which are factors, because the c function will interpret the factors as integers. To combine factors, they should first be converted back to their original values (through the levels function), then concatenated and converted to a new factor.

The cut function is used to convert a numeric variable into a factor. The breaks= argument to cut is used to describe how ranges of numbers will be converted to factor values. If a number is provided through the breaks= argument, the resulting factor will be created by dividing the range of the variable into that number of equal-length intervals; if a vector of values is provided, the values in the vector are used to determine the breakpoints. Note that if a vector of values is provided, the number of levels of the resultant factor will be one less than the number of values in the vector. Notice that the default label for factors produced by cut contains the actual range of values that were used to divide the variable into factors. The pretty function can be used to choose cut points that are round numbers, but it may not return the number of levels that’s actually desired. The labels= argument to cut allows you to specify the levels of the factors. To produce factors based on percentiles of your data (for example, quartiles or deciles), the quantile function can be used to generate the breaks= argument, insuring nearly equal numbers of observations in each of the levels of the factor.

If you wish to create a factor based on one of the components of that date, you can extract it with strftime and convert it to a factor directly. Sometimes more flexibility can be achieved by using the cut function, which understands time units of months, days, weeks, and years through the breaks= argument. (For date/time values, units of hours, minutes, and seconds can also be used.). By default, cut starts weeks on Mondays; to use Sundays instead, pass the start.on.monday=FALSE argument to cut. Multiples of units can also be specified through the breaks= argument.

Sometimes it is useful to treat all combinations of several factors as if they were a single factor. In situations like these, the interaction function can be used. This function will take two or more factors, and create a new, unordered factor whose levels correspond to the combinations of the levels of the input factors. interaction’s default behavior is to include all possible combinations of its input factors. To retain only those combinations for which there were observations, the drop=TRUE argument can be passed to interaction. By default, interaction forms levels for the new factor by joining the levels of its component factors with a period (.). This can be overridden with the sep= argument.

data <- c(1, 2, 2, 3, 1, 2, 3, 3, 1, 2, 3, 3, 1)
fdata <- factor(data)
fdata
##  [1] 1 2 2 3 1 2 3 3 1 2 3 3 1
## Levels: 1 2 3
# modifying levels in factors
rdata <- factor(data, labels = c("I", "II", "III"))
rdata
##  [1] I   II  II  III I   II  III III I   II  III III I  
## Levels: I II III
levels(fdata) <- c("I", "II", "III")
fdata
##  [1] I   II  II  III I   II  III III I   II  III III I  
## Levels: I II III
# unordered factor
mons <- c("March", "April", "January", "November", "January", "September", "October", "September", "November", "August", "January", "November", "November", "February", "May", "August", "July", "December", "August", "August", "September", "November", "February", "April")
mons <- factor(mons)
table(mons)
## mons
##     April    August  December  February   January      July     March       May 
##         2         4         1         2         3         1         1         1 
##  November   October September 
##         5         1         3
# ordered factor
mons <- factor(mons, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), ordered = TRUE)
mons[1] < mons[2]
## [1] TRUE
table(mons)
## mons
##   January  February     March     April       May      June      July    August 
##         3         2         1         2         1         0         1         4 
## September   October  November  December 
##         3         1         5         1
# reordering levels
levels(InsectSprays$spray)
## [1] "A" "B" "C" "D" "E" "F"
InsectSprays$spray <- with(InsectSprays, reorder(spray, count, mean))
levels(InsectSprays$spray)
## [1] "C" "E" "D" "A" "B" "F"
attr(InsectSprays$spray, "scores")
##         A         B         C         D         E         F 
## 14.500000 15.333333  2.083333  4.916667  3.500000 16.666667
# forcing a particular order to a factor with relevel
levels(InsectSprays$spray)
## [1] "C" "E" "D" "A" "B" "F"
InsectSprays$spray <- relevel(InsectSprays$spray, "A")
levels(InsectSprays$spray)
## [1] "A" "C" "E" "D" "B" "F"
# converting factors to numeric
fert <- c(10, 20, 20, 50, 10, 20, 10, 50, 20)
fert <- factor(fert, levels = c(10, 20, 50), ordered = TRUE)
fert
## [1] 10 20 20 50 10 20 10 50 20
## Levels: 10 < 20 < 50
mean(as.numeric(levels(fert)[fert]))
## [1] 23.33333
mean(as.numeric(as.character(fert)))
## [1] 23.33333
# dropping unused levels
lets <- sample(letters, size = 100, replace = TRUE)
lets <- factor(lets)
table(lets[1:5])
## 
## a c d e f g h i j k l m n o p q r s t u v w x y z 
## 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0
table(lets[1:5, drop = TRUE])
## 
## c d l p s 
## 1 1 1 1 1
# or
table(factor(lets[1:5]))
## 
## c d l p s 
## 1 1 1 1 1
# combining factors
fact1 <- factor(sample(letters, size = 10, replace = TRUE))
fact2 <- factor(sample(letters, size = 10, replace = TRUE))
fact12 <- factor(c(levels(fact1)[fact1], levels(fact2)[fact2]))
fact12
##  [1] g z l j m f q j u x u w f k v g z x m y
## Levels: f g j k l m q u v w x y z
# creating factors from continuous variables
wfact <- cut(women$weight, 3)
table(wfact)
## wfact
## (115,131] (131,148] (148,164] 
##         6         5         4
wfact <- cut(women$weight, pretty(women$weight, 3))
wfact
##  [1] (100,120] (100,120] (100,120] (120,140] (120,140] (120,140] (120,140]
##  [8] (120,140] (120,140] (140,160] (140,160] (140,160] (140,160] (140,160]
## [15] (160,180]
## Levels: (100,120] (120,140] (140,160] (160,180]
wfact <- cut(women$weight, 3, labels = c("Low", "Medium", "High"))
table(wfact)
## wfact
##    Low Medium   High 
##      6      5      4
wfact <- cut(women$weight, quantile(women$weight, (0:4) / 4))
table(wfact)
## wfact
## (115,124] (124,135] (135,148] (148,164] 
##         3         4         3         4
# creating factors based on dates and times
everyday <- seq(from = as.Date("2005-1-1"), to = as.Date("2005-12-31"), by = "day")
cmonth <- format(everyday, "%b")
months <- factor(cmonth, levels = unique(cmonth), ordered = TRUE)
table(months)
## months
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 
##  31  28  31  30  31  30  31  31  30  31  30  31
# with cut
wks <- cut(everyday, breaks = "week")
head(wks)
## [1] 2004-12-27 2004-12-27 2005-01-03 2005-01-03 2005-01-03 2005-01-03
## 53 Levels: 2004-12-27 2005-01-03 2005-01-10 2005-01-17 ... 2005-12-26
qtrs <- cut(everyday, "3 months", labels = paste("Q", 1:4, sep = ""))
head(qtrs)
## [1] Q1 Q1 Q1 Q1 Q1 Q1
## Levels: Q1 Q2 Q3 Q4
# interactions
nlevels(CO2$Plant)
## [1] 12
nlevels(CO2$Type)
## [1] 2
newfact <- interaction(CO2$Plant, CO2$Type)
nlevels(newfact)
## [1] 24
newfact1 <- interaction(CO2$Plant, CO2$Type, drop = TRUE, sep = "_")
nlevels(newfact1)
## [1] 12
levels(newfact1)
##  [1] "Qn1_Quebec"      "Qn2_Quebec"      "Qn3_Quebec"      "Qc1_Quebec"     
##  [5] "Qc3_Quebec"      "Qc2_Quebec"      "Mn3_Mississippi" "Mn2_Mississippi"
##  [9] "Mn1_Mississippi" "Mc2_Mississippi" "Mc3_Mississippi" "Mc1_Mississippi"
# converting multiple numeric variables to factor (no data)
# cols <- c(35:74)
# df.j[, cols] <- lapply(df.j[, cols], factor)

# assigning levels to factor variables (no data)
# btw2009 <- within (btw2009, levels(stateA) <- c("BW", "BY", "BE","BB", "HB", "HH", "HE", "MV", "NI", "NW","RP", "SL", "SN", "ST", "SH", "TH"))

# re-ordering levels and assigning to a new factor variable
# ls <- with(btw9s, Bundesland[order(EW, -Voters)]) # reorder levels and supress one var
# btw9s <- within(btw9s, State1 <- factor(Bundesland, levels=ls)) # create a new factor var and assign levels

# re-assigning ordered levels to variables in a data frame 
levels(Arthritis$Improved)
## [1] "None"   "Some"   "Marked"
Arthritis$Improved <- ordered(Arthritis$Improved, levels = c("None", "Some", "Marked"))
# or
# exp1_long$condition <- factor(exp1_long$condition, levels = c("No_communication", "High_confidence", "Medium_confidence", "Low_confidence"))

# in table form there are occasions when you need numeric values for the levels of ordered factors in a table (no data)
# Simply re-assign the dimnames attribute of the table variables
# dimnames(JobSat)$income <- c(7.5, 20, 32.5, 60)
# dimnames(JobSat)$satisfaction <- 1:4

# You want to preserve the character labels of table dimensions, but also
# allow them to be sorted in some particular order
# dimnames(JobSat)$income <- paste(1:4, dimnames(JobSat)$income, sep = ":")
# dimnames(JobSat)$satisfaction <- paste(1:4, dimnames(JobSat)$satisfaction, sep = ":")

8.2 Tidyverse

A factor is a data structure in R that allows you to create a set of categories. We call these categories levels. It is well known in the psychological literature that we can only store a certain number of things in our working memory. Therefore, to help people make sense of categories, we shouldn’t show too many of them. This is where the strength of lumping factors shows. Lumping is nothing more than combining factor levels into a new and larger category. Factor variables are very useful but not very easy to manipulate. forcats contains very useful functions that make working on factor variables painless. The four following functions, fct_recode(), fct_relevel(), fct_reorder() and fct_relabel(), are the ones you must know. fct_reorder() is especially useful for plotting. fct_lump*() functions make it possible to lump several levels of a factor into a new other level. The tidyverse team no longer recommends the use of this function. Instead, we can use the new functions created in 2020:

  • fct_lump_min: lump levels that do not occur more than min times.
  • fct_lump_n: lumps n of the most or least frequently occurring levels.
  • fct_lump_prop: lumps of levels that occur at most n times * prop.
  • fct_lump_lowfreq: lumps the least frequent levels.

fct_lump_min summarizes all levels that do not appear more than min times. Compared to fct_lump_min, fct_lump_n is not about the number of levels. Instead, it simply keeps the most frequent levels or the least frequent levels. We have to decide what to do with the levels that occur the same number of times. If you don’t give the function any additional information, fct_lump_n will show you all the levels whose number falls below the last level, which is clearly one of the most frequent levels. You can change this behavior with the ties.method argument. The default argument is min. The other options are “average”, “first”, “last”, “random” and “max”.

fct_lump_prop represents the percentage at which a particular level occurs within the total number of levels.

Some of these four functions cause the Other level not to be the least common level. fct_lump_lowfreq simply ensures that so many levels are grouped together that the “Other” is still the least frequent level. It has no additional arguments except other_level, which is used to specify the name of the “other” level.

A factor is an ordered data structure in R. What is ordered in a factor are its levels. Usually, you need to know the order of levels when you try to visualize factor columns. Let’s say you want to order the bars in your bar chart. Or you want to order the facets in your chart. Or you want to order the lines in your line chart. We can change the order of the factor levels manually by using the function fct_relevel. The first argument is the factor column. Then you list the levels you want to place first. You could also do this with a vector. By default factor levels are ordered by alphabetical order. We can place factor levels wherever we like with the after argument. Your levels will be placed at the nth + 1 position of this number.

With the function fct_infreq we can change the order according to how frequent each level occurs. If you want to reverse the order of the levels, you can use the fct_rev function.

The function fct_reorder allows to order the levels based on another continuous variable. We can reverse the order in two ways. First, by setting the .desc argument to TRUE or with the function fct_rev. The .fun argument is set to the function median and it is useful when you have many values for each factor level. This is always the case when you create a boxplot or a violin diagram. The function fct_reorder2 finds the largest values of one variable at the largest value of another variable. We can reverse the order by setting the .fun argument to first2.

head(gss_cat)
## # A tibble: 6 × 9
##    year marital         age race  rincome        partyid     relig denom tvhours
##   <int> <fct>         <int> <fct> <fct>          <fct>       <fct> <fct>   <int>
## 1  2000 Never married    26 White $8000 to 9999  Ind,near r… Prot… Sout…      12
## 2  2000 Divorced         48 White $8000 to 9999  Not str re… Prot… Bapt…      NA
## 3  2000 Widowed          67 White Not applicable Independent Prot… No d…       2
## 4  2000 Never married    39 White Not applicable Ind,near r… Orth… Not …       4
## 5  2000 Divorced         25 White Not applicable Not str de… None  Not …       1
## 6  2000 Married          25 White $20000 - 24999 Strong dem… Prot… Sout…      NA
str(gss_cat$marital)
##  Factor w/ 6 levels "No answer","Never married",..: 2 4 5 2 4 6 2 4 6 6 ...
str(gss_cat$rincome)
##  Factor w/ 16 levels "No answer","Don't know",..: 8 8 16 16 16 5 4 9 4 4 ...
# recoding levels
gss_cat <- gss_cat %>%
  mutate(marital = fct_recode(marital,
    refuse = "No answer",
    never_married = "Never married",
    divorced = "Separated",
    divorced = "Divorced",
    widowed = "Widowed",
    married = "Married"
  ))

gss_cat %>%
  janitor::tabyl(marital)
##        marital     n      percent
##         refuse    17 0.0007913234
##  never_married  5416 0.2521063166
##       divorced  4126 0.1920588372
##        widowed  1807 0.0841130196
##        married 10117 0.4709305032
# lumping categories with the old not-recommend fct_lump() function
gss_cat <- gss_cat %>%
  mutate(marital = fct_lump(marital, prop = 0.10, other_level = "other"))

gss_cat %>%
  mutate(
    # Description of the different functions taken from help(fct_lump)
    denom_lowfreq = fct_lump_lowfreq(denom), # lumps together the least frequent levels, ensuring that "other" is still the smallest level.
    denom_min = fct_lump_min(denom, min = 10), # lumps levels that appear fewer than min times.
    denom_n = fct_lump_n(denom, n = 3), # lumps all levels except for the n most frequent (or least frequent if n < 0)
    denom_prop = fct_lump_prop(denom, prop = 0.10) # lumps levels that appear in fewer prop * n times.
  )
## # A tibble: 21,483 × 13
##     year marital   age race  rincome partyid relig denom tvhours denom…¹ denom…²
##    <int> <fct>   <int> <fct> <fct>   <fct>   <fct> <fct>   <int> <fct>   <fct>  
##  1  2000 never_…    26 White $8000 … Ind,ne… Prot… Sout…      12 Southe… Southe…
##  2  2000 divorc…    48 White $8000 … Not st… Prot… Bapt…      NA Baptis… Baptis…
##  3  2000 other      67 White Not ap… Indepe… Prot… No d…       2 No den… No den…
##  4  2000 never_…    39 White Not ap… Ind,ne… Orth… Not …       4 Not ap… Not ap…
##  5  2000 divorc…    25 White Not ap… Not st… None  Not …       1 Not ap… Not ap…
##  6  2000 married    25 White $20000… Strong… Prot… Sout…      NA Southe… Southe…
##  7  2000 never_…    36 White $25000… Not st… Chri… Not …       3 Not ap… Not ap…
##  8  2000 divorc…    44 White $7000 … Ind,ne… Prot… Luth…      NA Luther… Luther…
##  9  2000 married    44 White $25000… Not st… Prot… Other       0 Other   Other  
## 10  2000 married    47 White $25000… Strong… Prot… Sout…       3 Southe… Southe…
## # … with 21,473 more rows, 2 more variables: denom_n <fct>, denom_prop <fct>,
## #   and abbreviated variable names ¹​denom_lowfreq, ²​denom_min
gss_cat %>%
  tabyl(marital)
##        marital     n    percent
##  never_married  5416 0.25210632
##       divorced  4126 0.19205884
##        married 10117 0.47093050
##          other  1824 0.08490434
# another example
billboard %>%
  ggplot(aes(y = artist)) +
  geom_bar()

billboard %>%
  mutate(artist = fct_lump(as_factor(artist), 10)) %>%
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

# fct_lump() new functions
# using fct_lump_min()
billboard %>%
  mutate(artist = fct_lump_min(as_factor(artist), 3)) %>%
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

# another example
table(gss_cat$rincome)
## 
##      No answer     Don't know        Refused $25000 or more $20000 - 24999 
##            183            267            975           7363           1283 
## $15000 - 19999 $10000 - 14999  $8000 to 9999  $7000 to 7999  $6000 to 6999 
##           1048           1168            340            188            215 
##  $5000 to 5999  $4000 to 4999  $3000 to 3999  $1000 to 2999       Lt $1000 
##            227            226            276            395            286 
## Not applicable 
##           7043
gss_cat %>%
  mutate(rincome = fct_lump_min(rincome, 600)) %>%
  ggplot(aes(y = fct_infreq(rincome))) +
  geom_bar()

# using fct_lump_n()
gss_cat %>%
  mutate(rincome = fct_lump_n(rincome, n = 10)) %>% # lump all levels except the 10 most frequent
  ggplot(aes(y = fct_infreq(rincome))) +
  geom_bar()

billboard %>%
  mutate(artist = fct_lump_n(artist, n = -5)) %>% # lump all the levels that occur most often (exactly the opposite of what a positive number does)
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

billboard %>%
  mutate(artist = fct_lump_n(artist, 5, ties.method = "min")) %>%
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

billboard %>%
  mutate(artist = fct_lump_n(artist, 5, ties.method = "max")) %>%
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

billboard %>%
  mutate(artist = fct_lump_n(artist, 5, ties.method = "random")) %>%
  filter(artist != "Other") %>%
  ggplot(aes(y = artist)) +
  geom_bar()

# fct_lump_prop()
# step by step explanation
# 1. how many times all levels occur in total
(total_count_income <- gss_cat %>% count(rincome) %>%
  {
    sum(.$n)
  }) 
## [1] 21483
# 2. choose a specific income range and how often occurs 
(count_one_range <- gss_cat$rincome[gss_cat$rincome == "$20000 - 24999"] %>%
  length())
## [1] 1283
# 3. calculating the proportion
count_one_range / total_count_income
## [1] 0.05972164
# the same but with fct_lump_prop()
gss_cat %>%
  mutate(rincome = fct_lump_prop(rincome, .05)) %>% # levels that occur in less than 5% of all counts
  ggplot(aes(y = fct_infreq(rincome))) +
  geom_bar()

# checking with tidyverse
gss_cat %>%
  count(rincome, name = "count_per_income_range") %>%
  select(rincome, count_per_income_range) %>%
  mutate(
    total_count_income = sum(count_per_income_range),
    percentage = count_per_income_range / total_count_income
  ) %>%
  filter(percentage >= .05)
## # A tibble: 4 × 4
##   rincome        count_per_income_range total_count_income percentage
##   <fct>                           <int>              <int>      <dbl>
## 1 $25000 or more                   7363              21483     0.343 
## 2 $20000 - 24999                   1283              21483     0.0597
## 3 $10000 - 14999                   1168              21483     0.0544
## 4 Not applicable                   7043              21483     0.328
# using fct_lump_lowfreq)
# without fct_lump_lowfreq()
gss_cat %>%
  mutate(
    rincome = fct_lump_n(rincome, n = 10),
    color_coding_rincome = ifelse(rincome == "Other", "a", "b")
  ) %>%
  ggplot(aes(
    y = fct_infreq(rincome),
    fill = color_coding_rincome
  )) +
  scale_fill_manual(values = c("grey20", "grey80")) +
  geom_bar(show.legend = FALSE)

# with fct_lump_lowfreq()
gss_cat %>%
  mutate(
    rincome = fct_lump_lowfreq(rincome),
    color_coding_rincome = ifelse(rincome == "Other", "a", "b")
  ) %>%
  ggplot(aes(
    y = fct_infreq(rincome),
    fill = color_coding_rincome
  )) +
  scale_fill_manual(values = c("grey20", "grey80")) +
  geom_bar(show.legend = FALSE)

# using fct_reorder()
gss_cat %>%
  tabyl(marital) %>%
  ggplot() +
  geom_col(aes(y = n, x = marital)) +
  coord_flip()

gss_cat %>%
  tabyl(marital) %>%
  mutate(marital = fct_reorder(marital, n, .desc = FALSE)) %>%
  ggplot() +
  geom_col(aes(y = n, x = marital)) +
  coord_flip()

# order factor levels manually
# getting the data
msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# using fct_relevel()
msleep %>%
  dplyr::filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  mutate(
    name = fct_relevel(name, c("Cow", "Dog"))
  ) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  mutate(
    name = fct_relevel(name, "Cow", after = 3)
  ) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# how to order the levels based on how frequently each level occurs
# getting the data
mpg %>%
  ggplot(aes(x = manufacturer)) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

# using fct_infreq()
mpg %>%
  mutate(manufacturer = fct_infreq(manufacturer)) %>%
  ggplot(aes(x = manufacturer)) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

# in reverse order
mpg %>%
  mutate(manufacturer = fct_infreq(manufacturer) %>% fct_rev()) %>%
  ggplot(aes(x = manufacturer)) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

# how to order the levels based on the values of a numeric variable
# getting the data
(sleep_data <- msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  select(name, sleep_total))
## # A tibble: 4 × 2
##   name       sleep_total
##   <chr>            <dbl>
## 1 Cow                4  
## 2 Dog               10.1
## 3 Chimpanzee         9.7
## 4 Tiger             15.8
msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# using fct_reorder()
sleep_data %>%
  mutate(
    name = as.factor(name) %>% fct_reorder(sleep_total)
  ) %>%
  pull(name)
## [1] Cow        Dog        Chimpanzee Tiger     
## Levels: Cow Chimpanzee Dog Tiger
msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  mutate(
    name = as.factor(name) %>% fct_reorder(sleep_total)
  ) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# reversing the order
msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  mutate(
    name = as.factor(name) %>%
      fct_reorder(sleep_total, .desc = TRUE)
  ) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# or
msleep %>%
  filter(name %in% c("Cow", "Dog", "Tiger", "Chimpanzee")) %>%
  mutate(
    name = as.factor(name) %>%
      fct_reorder(sleep_total) %>%
      fct_rev()
  ) %>%
  ggplot(aes(x = name, y = sleep_total)) +
  geom_col()

# using the argument .fun = into de fct_reorder()
mpg %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot()

# by default .fun is set to median
mpg %>%
  mutate(
    manufacturer = fct_reorder(as.factor(manufacturer), displ)
  ) %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot()

mpg %>%
  mutate(
    manufacturer = fct_reorder(
      as.factor(manufacturer),
      displ
    )
  ) %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot() +
  stat_summary(geom = "point", fun = "median", color = "blue") +
  stat_summary(geom = "line", fun = "median", color = "blue", group = 1)

# setting .fun to max
mpg %>%
  mutate(
    manufacturer = fct_reorder(as.factor(manufacturer),
      displ,
      .fun = max
    )
  ) %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot() +
  stat_summary(geom = "point", fun = "max", color = "blue") +
  stat_summary(geom = "line", fun = "max", color = "blue", group = 1)

# setting .fun to min
mpg %>%
  mutate(
    manufacturer = fct_reorder(as.factor(manufacturer),
      displ,
      .fun = min
    )
  ) %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot() +
  stat_summary(geom = "point", fun = "min", color = "blue") +
  stat_summary(geom = "line", fun = "min", color = "blue", group = 1)

# setting the .fun to the range between the maximum and the minimum values
mpg %>%
  mutate(
    manufacturer = fct_reorder(as.factor(manufacturer),
      displ,
      .fun = function(x) max(x) - min(x)
    )
  ) %>%
  ggplot(aes(x = displ, y = manufacturer)) +
  geom_boxplot()

# with facets
population %>%
  filter(country %in% c(
    "Afghanistan", "Germany", "Brazil",
    "Canada"
  )) %>%
  mutate(country = fct_reorder(country,
    population,
    .fun = function(x) min(x) - max(x)
  )) %>%
  ggplot(aes(x = year, y = population)) +
  geom_line() +
  facet_wrap(vars(country))

# how to order levels based on the values of two numeric variables
# getting the data
marital_status_per_age <- gss_cat %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(
    prop = n / sum(n)
  ) %>%
  ungroup()

marital_status_per_age %>%
  ggplot(aes(x = age, y = prop, color = marital)) +
  stat_summary(geom = "line", fun = mean)

# using fct_reorder2()
marital_status_per_age %>%
  group_by(marital) %>%
  slice_max(age) %>%
  ungroup() %>%
  arrange(desc(prop))
## # A tibble: 4 × 4
##     age marital           n   prop
##   <int> <fct>         <int>  <dbl>
## 1    89 other           108 0.730 
## 2    89 married          25 0.169 
## 3    89 divorced          9 0.0608
## 4    89 never_married     6 0.0405
marital_status_per_age %>%
  mutate(
    marital = as.factor(marital) %>%
      fct_reorder2(age, prop)
  ) %>%
  ggplot(aes(x = age, y = prop, color = marital)) +
  stat_summary(geom = "line", fun = mean)

# reversing the order
marital_status_per_age %>%
  mutate(
    marital = as.factor(marital) %>%
      fct_reorder2(age, prop, .fun = first2)
  ) %>%
  ggplot(aes(x = age, y = prop, color = marital)) +
  stat_summary(geom = "line", fun = mean) +
  theme(
    legend.position = "left"
  )

9 Dates and Times

9.1 Base-R

R provides several options for dealing with date and date/time data. The builtin as.Date function handles dates (without times); the contributed package chron handles dates and times, but does not control for time zones; and the POSIXct and POSIXlt classes allow for dates and times with control for time zones. The general rule for date/time data in R is to use the simplest technique possible. Thus, for date only data, as.Date will usually be the best choice. If you need to handle dates and times, without time-zone information, the chron package is a good choice; the POSIX classes are especially useful when time-zone manipulation is important. Also, don’t overlook the various “as.” functions (like as.Date and as.POSIXlt) for converting among the different date types when necessary. Except for the POSIXlt class, dates are stored internally as the number of days or seconds from some reference date. Thus, dates in R will generally have a numeric mode, and the class function can be used to find the way they are actually being stored. The POSIXlt class stores date/time values as a list of components (hour, min, sec, mon, etc.) making it easy to extract these parts. To get the current date, the Sys.Date function will return a Date object which can be converted to a different class if necessary.

The as.Date function allows a variety of input formats through the format= argument. The default format is a four-digit year, followed by a month, then a day, separated by either dashes or slashes. If your input dates are not in the standard format, a format string can be composed using the elements shown in the following table.

Format codes for dates Internally, Date objects are stored as the number of days since January 1, 1970, using negative numbers for earlier dates. The as.numeric function can be used to convert a Date object to its internal form. To convert this form back to a Date object, it can be assigned a class of Date directly. To extract the components of the dates, the weekdays, months, days, or quarters functions can be used.

The chron function converts dates and times to chron objects. The dates and times are provided to the chron function as separate values, so some preprocessing may be necessary to prepare input date/times for the chron function. When using character values, the default format for dates is the decimal month value followed by the decimal day value followed by the year, using the slash as a separator. Alternatively, dates can be specified by a numeric value, representing the number of days since January 1, 1970. To input dates stored as the day of the year, the origin= argument can be used to interpret numeric dates relative to a different date. The default format for times consists of the hour, minutes, and seconds, separated by colons. Often the first task when using the chron package is to break apart the date and times if they are stored together. Chron values are stored internally as the fractional number of days from January 1, 1970. The as.numeric function can be used to access the internal values.

Format codes for chron objects POSIX represents a portable operating system interface, primarily for UNIX systems, but available on other operating systems as well. Dates stored in the POSIX format are date/time values (like dates with the chron package), but also allow modification of time zones. Unlike the chron package, which stores times as fractions of days, the POSIX date classes store times to the nearest second, so they provide a more accurate representation of times. There are two POSIX date/time classes, which differ in the way that the values are stored internally. The POSIXct class stores date/time values as the number of seconds since January 1, 1970, while the POSIXlt class stores them as a list with elements for second, minute, hour, day, month, and year, among others. Unless you need the list nature of the POSIXlt class, the POSIXct class is the usual choice for storing dates in R. The default input format for POSIX dates consists of the year, followed by the month and day, separated by slashes or dashes; for date/time values, the date may be followed by white space and a time in the form hour:minutes:seconds or hour:minutes. Valid POSIX date or date/time inputs:

  • 1915/6/16
  • 2005-06-24 11:25
  • 1990/2/17 12:20:05

If your input date/times are stored as the number of seconds from January 1, 1970, you can create POSIX date values by assigning the appropriate class directly to those values. Since many date manipulation functions refer to the POSIXt pseudo-class, be sure to include it in the class attribute of the values.

The POSIX date/time classes take advantage of the POSIX date/time implementation of your operating system, allowing dates and times in R to be manipulated in the same way they would be in, for example, a C program. The two most important functions in this regard are strptime, for inputting dates, and strftime, for formatting dates for output. Both of these functions use a variety of formatting codes. Nonformat characters (like the slashes) are interpreted literally. When using strptime, an optional time zone can be specified with the tz= option.

Since POSIX date/time values are stored internally as the number of seconds since January 1, 1970, they can easily use times that are not represented by a formatted version of the hour, minute, and second. Another way to create POSIX dates is to pass the individual components of the time to the ISOdate function. ISOdate will accept both numeric and character arguments.

For formatting dates for output, the format function will recognize the type of your input date, and perform any necessary conversions before calling strftime, so strftime rarely needs to be called directly.

When using POSIX dates, the optional usetz=TRUE argument to the format function can be specified to indicate that the time zone should be displayed. Additionally, as.POSIXlt and as.POSIXct can also accept Date or chron objects, so they can be input as described in the previous sections and converted as needed. Conversion between the two POSIX forms is also possible. The individual components of a POSIX date/time object can be extracted by first converting to POSIXlt if necessary, and then accessing the components directly.

Many of the statistical summary functions, like mean, min, max, etc are able to transparently handle date objects. If two times (using any of the date or date/time classes) are subtracted, R will return the result in the form of a time difference, which represents a difftime object. If an alternative unit of time was desired, the difftime function could be called, using the optional units= argument with any of the following values: “auto”, “secs”, “mins”, “hours”, “days”, or “weeks”.

Although difftime values are displayed with their units, they can be manipulated like ordinary numeric variables; arithmetic performed with these values will retain the original units. To convert a time difference in days to one of years, a good approximation is to divide the number of days by 365.25. However, the difftime value will display the time units as days. To modify this, the units attribute of the object can be modified.

The by= argument to the seq function can be specified either as a difftime value, or in any units of time that the difftime function accepts, making it very easy to generate sequences of dates. All the date classes except for chron will accept an integer before the interval provided as a by= argument. The cut function also understands units of days, weeks, months, and years, making it very easy to create factors grouped by these units.

Format codes can also be used to extract parts of dates, as an alternative to the weekdays and other functions. This same technique can be used to convert dates to factors.

# using as.Date
as.Date("1915-6-16")
## [1] "1915-06-16"
as.Date("1990/02/17")
## [1] "1990-02-17"
as.Date("1/15/2001", format = "%m/%d/%Y")
## [1] "2001-01-15"
as.Date("April 26, 2001", format = "%B %d, %Y")
## [1] "2001-04-26"
as.Date("22JUN01", format = "%d%b%y")
## [1] "2001-06-22"
# converting date to numeric and back to date
thedate <- as.Date("1/15/2001", format = "%m/%d/%Y")
ndate <- as.numeric(thedate)
ndate
## [1] 11337
class(ndate) <- "Date"
ndate
## [1] "2001-01-15"
# using chron
dtimes <- c("2002-06-09 12:45:40", "2003-01-29 09:30:40", "2002-09-04 16:45:40", "2002-11-13 20:00:40", "2002-07-07 17:30:40")
dtparts <- t(as.data.frame(strsplit(dtimes, " ")))
row.names(dtparts) <- NULL
thetimes <- chron(dates = dtparts[, 1], times = dtparts[, 2], format = c("y-m-d", "h:m:s"))
thetimes
## [1] (02-06-09 12:45:40) (03-01-29 09:30:40) (02-09-04 16:45:40)
## [4] (02-11-13 20:00:40) (02-07-07 17:30:40)
# using POSIXlt
dts <- c("2005-10-21 18:47:22", "2005-12-24 16:39:58", "2005-10-28 07:30:05 PDT")
as.POSIXlt(dts)
## [1] "2005-10-21 18:47:22 CEST" "2005-12-24 16:39:58 CET" 
## [3] "2005-10-28 07:30:05 CEST"
# using POSIXct
dts <- c(1127056501, 1104295502, 1129233601, 1113547501, 1119826801, 1132519502, 1125298801, 1113289201)
mydates <- dts
class(mydates) <- c("POSIXt", "POSIXct")
mydates
## [1] "2005-09-18 17:15:01 CEST" "2004-12-29 05:45:02 CET" 
## [3] "2005-10-13 22:00:01 CEST" "2005-04-15 08:45:01 CEST"
## [5] "2005-06-27 01:00:01 CEST" "2005-11-20 21:45:02 CET" 
## [7] "2005-08-29 09:00:01 CEST" "2005-04-12 09:00:01 CEST"
# or
# mydates <- structure(dts, class = c("POSIXt", "POSIXct"))

# using strptime
mydate <- strptime("16/Oct/2005:07:51:00", format = "%d/%b/%Y:%H:%M:%S")
mydate
## [1] "2005-10-16 07:51:00 CEST"
# transforming times that are not represented by a formatted version
mydates <- c("20060515 112504.5", "20060518 101000.3", "20060520 20035.1")
dtparts <- t(as.data.frame(strsplit(mydates, " ")))
dtimes <- strptime(dtparts[, 1], format = "%Y%m%d")
as.numeric(dtparts[, 2])
## [1] 112504.5 101000.3  20035.1
dtimes
## c..20060515....112504.5.. c..20060518....101000.3..  c..20060520....20035.1.. 
##         "2006-05-15 CEST"         "2006-05-18 CEST"         "2006-05-20 CEST"
# using ISOdate
ISOdate(2006, 5, 16, 7, 15, 04, tz = "PDT")
## [1] "2006-05-16 07:15:04 GMT"
# formatting dates for output
thedate <- ISOdate(2005, 10, 21, 18, 47, 22, tz = "PDT")
format(thedate, "%A, %B %d, %Y %H:%M:%S")
## [1] "Friday, October 21, 2005 18:47:22"
# extracting components of POSIX
mydate <- as.POSIXlt("2005-4-19 7:01:00")
names(mydate)
## NULL
mydate$mday
## [1] 19
# operations with dates
b1 <- ISOdate(1977, 7, 13)
b2 <- ISOdate(2003, 8, 14)
b2 - b1
## Time difference of 9528 days
difftime(b2, b1, units = "weeks")
## Time difference of 1361.143 weeks
# rdates <- scan(what = "")
# save(rdates, file = "input/rdates.RData")
load("input/rdates.RData")

rdates <- as.data.frame(matrix(rdates, ncol = 2, byrow = TRUE))
rdates[, 2] <- as.Date(rdates[, 2], format = "%d%b%Y")
names(rdates) <- c("Release", "Date")
rdates
##    Release       Date
## 1      1.0 2000-02-29
## 2      1.1 2000-06-15
## 3      1.2 2000-12-15
## 4      1.3 2001-06-22
## 5      1.4 2001-12-19
## 6      1.5 2002-04-29
## 7      1.6 2002-10-01
## 8      1.7 2003-04-16
## 9      1.8 2003-10-08
## 10     1.9 2004-04-12
## 11     2.0 2004-10-04
mean(rdates$Date)
## [1] "2002-05-19"
range(rdates$Date)
## [1] "2000-02-29" "2004-10-04"
rdates$Date[11] - rdates$Date[1]
## Time difference of 1679 days
ydiff <- (b2 - b1) / 365.25
ydiff
## Time difference of 26.08624 days
attr(ydiff, "units") <- "years"
ydiff
## Time difference of 26.08624 years
# time sequences
seq(as.Date("1976-7-4"), by = "days", length = 10)
##  [1] "1976-07-04" "1976-07-05" "1976-07-06" "1976-07-07" "1976-07-08"
##  [6] "1976-07-09" "1976-07-10" "1976-07-11" "1976-07-12" "1976-07-13"
seq(as.Date("2000-6-1"), to = as.Date("2000-8-1"), by = "2 weeks")
## [1] "2000-06-01" "2000-06-15" "2000-06-29" "2000-07-13" "2000-07-27"
# extracting parts of dates
table(format(rdates$Date, "%A"))
## 
##    Friday    Monday  Thursday   Tuesday Wednesday 
##         2         3         1         2         3
# converting dates to factors
fdate <- factor(format(rdates$Date, "%Y"))
fdate
##  [1] 2000 2000 2000 2001 2001 2002 2002 2003 2003 2004 2004
## Levels: 2000 2001 2002 2003 2004

9.2 Tidyverse

In R, as in most programming languages, there’s a difference between a character string that looks like a date – “2019-06-21” or “June 21, 2019” – and an actual date object with specific methods (class-specific functions) hat only work on dates. A date object can print out as “2019-06-21”, but its behavior will be different from the string version that also prints out as “2019-06-21”. For example, “2019-06-21” + 1 throws an error if “2019-06-21” is a character string, but will return “2019-06-22” for a date. lubridate is yet another tidyverse package, that makes dealing with dates or durations (and intervals) as painless as possible.

There are several helpful functions included in lubridate to convert columns to dates. For instance if the column you want to convert is of the form “2012-11-21”, then you would use the function ymd(), for “year-month-day”. If, however the column is “2012-21-11”, then you would use ydm(). There’s a few of these helper functions, and they can handle a lot of different formats for dates. But you have to be careful with leap years. When a year is not a leap year, the computation returns NA. The same goes for months with a different number of days. The way to solve these issues is to use the special %m+% infix operator.

fechas <- c("1999/12/31", "2000/01/07", "2005/05/20", "2010/03/25")

# converting strings to dates
fechas <- lubridate::ymd(fechas)
class(fechas)
## [1] "Date"
# extracting years
lubridate::year(fechas)
## [1] 1999 2000 2005 2010
# extracting months
lubridate::month(fechas)
## [1] 12  1  5  3
lubridate::month(fechas, label = TRUE)
## [1] Dec Jan May Mar
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# weekdays
lubridate::wday(fechas)
## [1] 6 6 6 5
lubridate::wday(fechas, label = TRUE)
## [1] Fri Fri Fri Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# week
lubridate::week(fechas)
## [1] 53  1 20 12
# semester
lubridate::semester(fechas)
## [1] 2 1 1 1
# changing system defaults
sys_time_old <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "Spanish.UTF-8")
## [1] ""
lubridate::month(fechas, label = TRUE)
## [1] Dec Jan May Mar
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
Sys.setlocale("LC_TIME", sys_time_old)
## [1] "C"
# figuring out differences between dates
diff(fechas)
## Time differences in days
## [1]    7 1960 1770
difftime(fechas[3], fechas[1], units = "weeks")
## Time difference of 281 weeks
# sequence
seq(fechas[1], fechas[2], "day")
## [1] "1999-12-31" "2000-01-01" "2000-01-02" "2000-01-03" "2000-01-04"
## [6] "2000-01-05" "2000-01-06" "2000-01-07"
seq(fechas[1], fechas[3], "year")
## [1] "1999-12-31" "2000-12-31" "2001-12-31" "2002-12-31" "2003-12-31"
## [6] "2004-12-31"
# rounding dates
lubridate::round_date(fechas, "month")
## [1] "1999-12-01" "2000-01-01" "2005-05-01" "2010-03-01"
lubridate::round_date(fechas, "year")
## [1] "1999-01-01" "2000-01-01" "2005-01-01" "2010-01-01"
# time
llegada <- lubridate::ymd_hms("2011-06-04 12:25:00", tz = "Europe/Berlin")
salida <- lubridate::ymd_hms("2011-06-05 14:45:00", tz = "Europe/Berlin")
llegada
## [1] "2011-06-04 12:25:00 CEST"
salida
## [1] "2011-06-05 14:45:00 CEST"
lubridate::minute(llegada)
## [1] 25
lubridate::with_tz(llegada, "America/Santiago")
## [1] "2011-06-04 06:25:00 -04"
independence <- readRDS("input/independence.rds")

# converting variables to date objects
independence <- independence %>%
  mutate(independence_date = lubridate::dmy(independence_date))

df_month <- data.frame(month = 1:12, year = 2000, ta = rnorm(12, 15, 2))
dplyr::mutate(df_month, date = lubridate::make_date(year, month))
##    month year       ta       date
## 1      1 2000 15.99580 2000-01-01
## 2      2 2000 11.86363 2000-02-01
## 3      3 2000 15.10576 2000-03-01
## 4      4 2000 12.95497 2000-04-01
## 5      5 2000 16.44845 2000-05-01
## 6      6 2000 16.24575 2000-06-01
## 7      7 2000 14.55560 2000-07-01
## 8      8 2000 15.75073 2000-08-01
## 9      9 2000 15.99762 2000-09-01
## 10    10 2000 15.88499 2000-10-01
## 11    11 2000 13.43720 2000-11-01
## 12    12 2000 16.36287 2000-12-01
# data manipulation with lubridate and dplyr
independence %>%
  filter(year(independence_date) <= 1960) %>%
  pull(country)
##  [1] "Liberia"                          "South Africa"                    
##  [3] "Egypt"                            "Eritrea"                         
##  [5] "Libya"                            "Sudan"                           
##  [7] "Tunisia"                          "Ghana"                           
##  [9] "Guinea"                           "Cameroon"                        
## [11] "Togo"                             "Mali"                            
## [13] "Madagascar"                       "Democratic Republic of the Congo"
## [15] "Benin"                            "Niger"                           
## [17] "Burkina Faso"                     "Ivory Coast"                     
## [19] "Chad"                             "Central African Republic"        
## [21] "Republic of the Congo"            "Gabon"                           
## [23] "Mauritania"
independence %>%
  filter(lubridate::month(independence_date) == 12,
         lubridate::day(independence_date) == 24) %>%
  pull(country)
## [1] "Libya"
independence %>%
  mutate(today = lubridate::today()) %>%
  mutate(independent_since = lubridate::interval(independence_date, today)) %>%
  select(country, independent_since)
## # A tibble: 54 × 2
##    country      independent_since             
##    <chr>        <Interval>                    
##  1 Liberia      1847-07-26 UTC--2022-09-20 UTC
##  2 South Africa 1910-05-31 UTC--2022-09-20 UTC
##  3 Egypt        1922-02-28 UTC--2022-09-20 UTC
##  4 Eritrea      1947-02-10 UTC--2022-09-20 UTC
##  5 Libya        1951-12-24 UTC--2022-09-20 UTC
##  6 Sudan        1956-01-01 UTC--2022-09-20 UTC
##  7 Tunisia      1956-03-20 UTC--2022-09-20 UTC
##  8 Morocco      NA--NA                        
##  9 Ghana        1957-03-06 UTC--2022-09-20 UTC
## 10 Guinea       1958-10-02 UTC--2022-09-20 UTC
## # … with 44 more rows
independence %>%
  mutate(today = lubridate::today()) %>%
  mutate(independent_since = lubridate::interval(independence_date, today)) %>%
  select(country, independent_since) %>%
  mutate(years_independent = as.numeric(independent_since, "years"))
## # A tibble: 54 × 3
##    country      independent_since              years_independent
##    <chr>        <Interval>                                 <dbl>
##  1 Liberia      1847-07-26 UTC--2022-09-20 UTC             175. 
##  2 South Africa 1910-05-31 UTC--2022-09-20 UTC             112. 
##  3 Egypt        1922-02-28 UTC--2022-09-20 UTC             101. 
##  4 Eritrea      1947-02-10 UTC--2022-09-20 UTC              75.6
##  5 Libya        1951-12-24 UTC--2022-09-20 UTC              70.7
##  6 Sudan        1956-01-01 UTC--2022-09-20 UTC              66.7
##  7 Tunisia      1956-03-20 UTC--2022-09-20 UTC              66.5
##  8 Morocco      NA--NA                                      NA  
##  9 Ghana        1957-03-06 UTC--2022-09-20 UTC              65.5
## 10 Guinea       1958-10-02 UTC--2022-09-20 UTC              64.0
## # … with 44 more rows
independence %>%
  filter(colonial_power %in% c("Belgium", "France", "Portugal", "United Kingdom")) %>%
  mutate(today = lubridate::today()) %>%
  mutate(independent_since = lubridate::interval(independence_date, today)) %>%
  mutate(years_independent = as.numeric(independent_since, "years")) %>%
  group_by(colonial_power) %>%
  dplyr::summarise(last_colony_independent_for = min(years_independent, na.rm = TRUE))
## # A tibble: 4 × 2
##   colonial_power last_colony_independent_for
##   <chr>                                <dbl>
## 1 Belgium                               60.2
## 2 France                                45.2
## 3 Portugal                              46.9
## 4 United Kingdom                        46.2
# arithmetic with dates
lubridate::ymd("2018-12-31") + 16
## [1] "2019-01-16"
lubridate::ymd("2018-12-31") + lubridate::days(16)
## [1] "2019-01-16"
lubridate::ymd("2018-12-31") + lubridate::years(1)
## [1] "2019-12-31"
# leap years
library("lubridate")
lubridate::ymd("2016-02-29") + lubridate::years(1)
## [1] NA
ymd("2016-02-29") %m+% years(1)
## [1] "2017-02-28"
ymd("2018-12-31") %m+% months(2)
## [1] "2019-02-28"

10 Data aggregation

10.1 Base-R

R provides a wide array of functions to aid in aggregating data. For simple tabulation and cross-tabulation, the table function is available. For more complex tasks, the available functions can be broken down into two groups: those that are designed to work effectively with arrays and/or lists, like apply, sweep, mapply, sapply, and lapply, and those that are oriented toward data frames (like aggregate and by). There is considerable overlap between the two tools, and the output of one can be converted to the equivalent of the output from another, so often the choice of an appropriate function is a matter of personal taste.

The arguments to the table function can either be individual vectors representing the levels of interest, or a list or data frame composed of such vectors. The result from table will always be an array of as many dimensions as the number of vectors being tabulated, with dimnames extracted from the levels of the cross-tabulated variables. By default, table will not include missing values in its output; to override this, use the exclude=NULL argument. When passed a single vector of values, table returns an object of class table, which can be treated as a named vector. For simple queries regarding individual levels of a tabulated variable, this may be the most convenient form of displaying and storing the values. Alternatively, the output from table can be converted to a data frame using as.data.frame. When multiple vectors are passed to table, an array of as many dimensions as there are vectors is returned.

When passed a data frame, table treats each column as a separate variable, resulting in a table that effectively counts how often each row appears in the data frame. This can be especially useful when the result of table is passed to as.data.frame, since its form will be similar to the input data frame. Since the data frame was formed from a table, all possible combinations, including those with no observations, are included.

Sometimes it is helpful to display the margins of a table, that is, the sum of each row and/or column, in order to understand differences among the levels of the variables from which the table was formed. The addmargins function accepts a table and returns a similar table, with the requested margins added. To specify which dimensions should have margins added, the margin= argument accepts a vector of dimensions; a value of 1 in this vector means a new row with the margins for the columns will be added, and a value of 2 corresponds to a new column containing row margins. The default operation to create the margins is to use the sum function. If some other function is desired, it can be specified through the FUN= argument. When a margin is added, the dimnames for the table are adjusted to include a description of the margin.

When it’s desired to have a table of proportions instead of counts, one strategy would be to use the sweep function dividing each row and column by its corresponding margin. The prop.table function provides a convenient wrapper around this operation. prop.table accepts a table, and a margin= argument, and returns a table of proportions. With no value specified for margin=, the sum of all the cells in the table will be 1; with margin=1, each row of the resulting table will add to 1, and with margin=2, each column will add to 1.

For tables with more than two dimensions, it may be useful to present the table in a “flattened” form using the ftable function. The xtabs function can produce similar results to the table function, but uses the formula language interface. If a variable is given on the left-hand side of the tilde (~), it is interpreted as a vector of counts corresponding to the values of the variables on the right-hand side, making it very easy to convert already tabulated data into R’s notion of a table.

When confronted with an aggregation problem, there are three main considerations:

  1. How are the groups that divide the data defined?
  2. What is the nature of the data to be operated on?
  3. What is the desired end result?

Thinking about these issues will help to point you to the most effective solution for your needs. The following paragraphs should help you make the best choice.

Groups defined as list elements. If the groups you’re interested in are already organized as elements of a list, then sapply or lapply are the appropriate functions; they differ in that lapply always returns a list, while sapply may simplify its output into a vector or array if appropriate. This is a very flexible approach, since the entire data frame for each group is available. Sometimes, if other methods are inappropriate, you can first use the split function to create a suitable list for use with sapply or lapply.

Groups defined by rows or columns of a matrix. When the goal is to operate on each column or row of a matrix, the apply function is the logical choice. apply will usually return its results as a vector or array, but will return a list if the results of operating on the rows or columns are of different dimensions.

Groups based on one or more grouping variables. A wide array of choices is available for the very common task of operating on subsets of data based on the value of a grouping variable. If the computations you desire each involve only a single vector and produce a single scalar as a result (like calculating a scalar-valued statistic for a variable or set of variables), the aggregate function is the most likely choice. Since aggregate always returns a data frame, it is especially useful if the desired result is to create a plot or fit a statistical model to the aggregated data.

If your computations involve a single vector, but the result is a vector (for example, a set of quantiles or a vector of different statistics), tapply is one available option. Unlike aggregate, tapply returns its results in a vector or array for which individual elements are easy to access but may produce a difficult-to-interpret display for complex problems. Another approach to the problem is provided by the reshape package, available through CRAN, It uses a formula interface, and can produce output in a variety of forms. When the desired result requires access to more than one variable at a time (for example, calculating a correlation matrix, or creating a scatter plot), row indices can be passed to tapply to extract the appropriate rows corresponding to each group. Alternatively, the by function can be used. Unlike tapply, the special list returned by by has a print method which will always produce an easily-readable display of the aggregation, but accessing individual elements of the returned list may be inconvenient. Naturally, for tasks like plotting, there is no clear reason to choose one approach over the other.

As mentioned previously, using split and sapply/lapply is a good solution if you find that other methods don’t provide the flexibility you need. Finally, if nothing else seems to work, you can write a loop to iterate over the values returned by unique or intersection, and perform whatever operations you desire. If you take this route, make sure to consider the issues about memory management in loops.

Although most functions in R will automatically operate on each element of a vector, the same is not true for lists. Since many R functions return lists, it’s often useful to process each list element in the same way that R naturally does for vectors. To handle situations like this, R provides two functions: lapply and sapply. Each of these functions takes a list or vector as its first argument, and a function to be applied to each element as its second argument. The difference between the two functions is that lapply will always return its result as a list, while sapply will simplify its output to a vector or matrix if possible. Another important use of sapply relates to data frames. When treated as a list, each column of a data frame retains its mode and class. To get this information from a data frame sapply can be used. When the structure of the data would be lost if sapply tried to simplify it into a vector or array. This same idea can be used to extract columns of a data frame that meet a particular condition. sapply or lapply can be used as an alternative to loops for performing repetitive tasks. When you use these functions, they take care of the details of deciding on the appropriate form of the output, and eliminate the need to incrementally build up a vector or matrix to store the result.

When your data has the added organization of an array, R provides a convenient way to operate on each dimension of the data through the apply function. This function requires three arguments: the array on which to perform the operation, an index telling apply which dimension to operate on, and the function to use. Like sapply, additional arguments to the function can be placed at the end of the argument list. For matrices, a second argument of 1 means “operate on the rows”, and 2 means “operate on the columns”. One common use of apply is in conjunction with functions like scale, which require summary statistics calculated for each column of a matrix. Without additional arguments, the scale function will subtract the mean of each column and divide by the standard deviation, resulting in a matrix of zscores. To use other statistics, appropriate vectors of values can be calculated using apply and provided to scale using the center= and scale= arguments. Similar to sapply, apply will try to return its results in a vector or matrix when appropriate, making it useful in cases where several quantities need to be calculated for each row or column of a matrix. apply will use names that are present in the input matrix or data frame to properly label the result that it returns. If a vector needs to be processed in non-overlapping groups, it is sometimes easiest to temporarily treat the vector as a matrix, and use apply to operate on the groups.

The apply function is very general, and for certain applications, there may be more efficient methods available to perform the necessary computations. For example, if the statistic to be calculated is the sum or the mean, matrix computations will be more efficient than calling apply with the appropriate function. In cases like this, the rowSums, colSums, rowMeans, or functions can be used. Each of these functions accepts a matrix (or a data frame which will be coerced to a matrix), and an optional na.rm= argument to specify the handling of missing values. Since these functions will accept logical values as input as well as numeric values, they can be very useful for counting operations.

A common situation when processing a matrix by rows or columns is that each row or column needs to be processed differently, based on the values of an auxiliary vector which already exists. In cases like this, the sweep function can be used. Like apply, the first two arguments to sweep are the matrix to be operated on and the index of the dimension to be used for repetitive processing. In addition, sweep takes a third argument representing the vector to be used when processing each column, and finally a fourth argument providing the function to be used. sweep operates by building matrices which can be operated on in a single call, so, unlike apply, only functions which can operate on arrays of values can be passed to sweep. All of the built-in binary operators, such as addition (“+”), subtraction (“-”), multiplication (“*”), and division (“/”) can be used, but, in general, it will be necessary to make sure an arbitrary function will work properly with sweep.

To calculate scalar data summaries of one or more columns of a data frame or matrix, the aggregate function can be used. Although this function is limited to returning scalar values, it can operate on multiple columns of its input argument, making it a natural choice for data summaries for multiple variables. The first argument to aggregate is a data frame or matrix containing the variables to be summarized, the second argument is a list containing the variables to be used for grouping, and the third argument is the function to be used to summarize the data. Since the second argument must be a list, when a data frame is being processed it is often convenient to refer to the grouping columns using single bracket subscripts, since columns accessed this way will naturally be in the form of a list. In addition, with more than one grouping variable, specifying the columns this way will insure that the grouping variables’ names will be automatically transferred to the output data frame. If the columns are passed as manually constructed list, aggregate will use names like Group.1 to identify the grouping variables, unless names are provided for the list elements.

To process a single vector based on the values of one or more grouping vectors, the tapply function can also be used. The returned value from tapply will be an array with as many dimensions as there were vectors that defined the groups. Unlike aggregate, tapply is not limited to returning scalars. To convert values like this to data frames, the dimnames of the returned object can be combined with the values. When each element of the vector is of the same length, this operation is fairly straightforward, but the problem becomes difficult when the return values are of different lengths. When more than one grouping variable is used with tapply, and the return value from the function used is not a scalar, the returned object is somewhat more difficult to interpret.

The by function generalizes the idea of tapply to operate on entire data frames broken down by a list of grouping variables. Thus, the first argument to by is a data frame, and the remaining arguments are similar to those of tapply. Each of the rows returned by the by function is in the form that we would like for a data frame containing these results, so it would be natural to use rbind to convert this result to a data frame; however, it is tedious to pass each row to the rbind function individually. In cases like this, the do.call function can usually generalize the operation so that it will be carried out properly regardless of how many elements need to be processed. do.call takes a list of arguments and passes them to a function as if they were the argument list for the function call.

An alternative approach to aggregation is provided by the reshape package, available from CRAN. The functions in this package provide a unified approach to aggregation, based on an extended formula notation. The core idea behind the reshape package is to create a “melted” version of a dataset (through the melt function), which can then be “cast” (with the cast function) into an object with the desired orientation. To melt a data frame, list, or array into the appropriate melted form, it is first necessary to divide the variables into id variables and measure or analysis variables; this should generally be obvious from the nature of the data. By default, melt treats factor and integer variables as id variables, and the remaining variables as analysis variables; if your data is structured according to this convention, no additional information needs to be provided to melt. Otherwise, the id.var= or measure.var= arguments can be used; if you specify one, it will assume all the other variables are of the other type. Once a dataset is melted, it can be cast into a variety of forms. Notice that melt displays the names of variables that have been automatically assigned as id variables. The basic melting operation preserves the id variables, and converts the measured variables into two columns named variable (which identifies which variable is being measured) and value (which contains the actual values). You can use a name other than variable by specifying a variable name= argument to melt. The left-hand side of the formula passed to cast represents the variable(s) which will appear in the columns of the result, and the right-hand side describes the variables which will appear in the rows. Formulas used by cast can include a single dot (.) to represent an overall summary, or three dots ... to represent all variables not otherwise included in the formula. When used for aggregation, an aggregation function should be supplied; if not it defers to using length. To limit the variables that are used, we can use the subset= argument of cast. Since this argument uses the melted data, we need to refer to the variable named variable. A list of functions can be provided to cast function. To provide added flexibility, the vertical bar (|) can be used to cause cast to produce a list instead of a data frame. The default behavior of cast is to only include combinations actually encountered in the data. To include all possible combinations, use the add.missing=TRUE argument. In each of the preceding examples, the dataset was first melted, then repeated calls to cast were carried out. If only a single call to cast is needed, the recast function combines the melt and cast steps into a single call.

pets <- c("dog", "cat", "duck", "chicken", "duck", "cat", "dog")
tt <- table(pets)
tt
## pets
##     cat chicken     dog    duck 
##       2       1       2       2
tt["cat"]
## cat 
##   2
# converting output from table to a data frame
as.data.frame(tt)
##      pets Freq
## 1     cat    2
## 2 chicken    1
## 3     dog    2
## 4    duck    2
# passing multiple vectors to table
hiinc <- state.x77[, "Income"] > median(state.x77[, "Income"])
stateinc <- table(state.region, hiinc)
stateinc
##                hiinc
## state.region    FALSE TRUE
##   Northeast         4    5
##   South            12    4
##   North Central     5    7
##   West              4    9
as.data.frame(stateinc)
##    state.region hiinc Freq
## 1     Northeast FALSE    4
## 2         South FALSE   12
## 3 North Central FALSE    5
## 4          West FALSE    4
## 5     Northeast  TRUE    5
## 6         South  TRUE    4
## 7 North Central  TRUE    7
## 8          West  TRUE    9
# passing a data frame to table and converting it back to a data frame
x <- data.frame(a = c(1, 2, 2, 1, 2, 2, 1), b = c(1, 2, 2, 1, 1, 2, 1), c = c(1, 1, 2, 1, 2, 2, 1))
x
##   a b c
## 1 1 1 1
## 2 2 2 1
## 3 2 2 2
## 4 1 1 1
## 5 2 1 2
## 6 2 2 2
## 7 1 1 1
as.data.frame(table(x))
##   a b c Freq
## 1 1 1 1    3
## 2 2 1 1    0
## 3 1 2 1    0
## 4 2 2 1    1
## 5 1 1 2    0
## 6 2 1 2    1
## 7 1 2 2    0
## 8 2 2 2    2
# adding margins
tt <- table(infert$education, infert$parity)
tt
##          
##            1  2  3  4  5  6
##   0-5yrs   3  0  0  3  0  6
##   6-11yrs 42 42 21 12  3  0
##   12+ yrs 54 39 15  3  3  2
# adding a row of margins
tt1 <- addmargins(tt, 1)
tt1
##          
##            1  2  3  4  5  6
##   0-5yrs   3  0  0  3  0  6
##   6-11yrs 42 42 21 12  3  0
##   12+ yrs 54 39 15  3  3  2
##   Sum     99 81 36 18  6  8
# adding margins to both rows and columns
tt12 <- addmargins(tt, c(1, 2))
tt12
##          
##             1   2   3   4   5   6 Sum
##   0-5yrs    3   0   0   3   0   6  12
##   6-11yrs  42  42  21  12   3   0 120
##   12+ yrs  54  39  15   3   3   2 116
##   Sum      99  81  36  18   6   8 248
# getting proportions
prop.table(tt, 2)
##          
##                    1          2          3          4          5          6
##   0-5yrs  0.03030303 0.00000000 0.00000000 0.16666667 0.00000000 0.75000000
##   6-11yrs 0.42424242 0.51851852 0.58333333 0.66666667 0.50000000 0.00000000
##   12+ yrs 0.54545455 0.48148148 0.41666667 0.16666667 0.50000000 0.25000000
# multidimensional table
ftable(UCBAdmissions)
##                 Dept   A   B   C   D   E   F
## Admit    Gender                             
## Admitted Male        512 353 120 138  53  22
##          Female       89  17 202 131  94  24
## Rejected Male        313 207 205 279 138 351
##          Female       19   8 391 244 299 317
# using xtabs
xtabs(~ state.region + hiinc)
##                hiinc
## state.region    FALSE TRUE
##   Northeast         4    5
##   South            12    4
##   North Central     5    7
##   West              4    9
x <- data.frame(a = c(1, 2, 2, 1, 2, 2, 1), b = c(1, 2, 2, 1, 1, 2, 1), c = c(1, 1, 2, 1, 2, 2, 1))
dfx <- as.data.frame(table(x))
xtabs(Freq ~ a + b + c, data = dfx)
## , , c = 1
## 
##    b
## a   1 2
##   1 3 0
##   2 0 1
## 
## , , c = 2
## 
##    b
## a   1 2
##   1 0 0
##   2 1 2
# mapping a Function to a Vector or List
text <- c("R is a free environment for statistical analysis", "It compiles and runs on a variety of platforms", "Visit the R home page for more information")
result <- strsplit(text, " ")
result
## [[1]]
## [1] "R"           "is"          "a"           "free"        "environment"
## [6] "for"         "statistical" "analysis"   
## 
## [[2]]
## [1] "It"        "compiles"  "and"       "runs"      "on"        "a"        
## [7] "variety"   "of"        "platforms"
## 
## [[3]]
## [1] "Visit"       "the"         "R"           "home"        "page"       
## [6] "for"         "more"        "information"
# reports the number of elements in the returned list (3)
length(result)
## [1] 3
# finding out the length of the individual elements of the list (words)
nwords <- sapply(result, length)
nwords
## [1] 8 9 8
# getting mode and class from a data frame with sapply()
class(ChickWeight)
## [1] "nfnGroupedData" "nfGroupedData"  "groupedData"    "data.frame"
sapply(ChickWeight, class)
## $weight
## [1] "numeric"
## 
## $Time
## [1] "numeric"
## 
## $Chick
## [1] "ordered" "factor" 
## 
## $Diet
## [1] "factor"
# extracting columns of a data frame that meet a particular condition (numeric) with sapply()
df <- ChickWeight[, sapply(ChickWeight, class) == "numeric"]
df
##     weight Time
## 1       42    0
## 2       51    2
## 3       59    4
## 4       64    6
## 5       76    8
## 6       93   10
## 7      106   12
## 8      125   14
## 9      149   16
## 10     171   18
## 11     199   20
## 12     205   21
## 13      40    0
## 14      49    2
## 15      58    4
## 16      72    6
## 17      84    8
## 18     103   10
## 19     122   12
## 20     138   14
## 21     162   16
## 22     187   18
## 23     209   20
## 24     215   21
## 25      43    0
## 26      39    2
## 27      55    4
## 28      67    6
## 29      84    8
## 30      99   10
## 31     115   12
## 32     138   14
## 33     163   16
## 34     187   18
## 35     198   20
## 36     202   21
## 37      42    0
## 38      49    2
## 39      56    4
## 40      67    6
## 41      74    8
## 42      87   10
## 43     102   12
## 44     108   14
## 45     136   16
## 46     154   18
## 47     160   20
## 48     157   21
## 49      41    0
## 50      42    2
## 51      48    4
## 52      60    6
## 53      79    8
## 54     106   10
## 55     141   12
## 56     164   14
## 57     197   16
## 58     199   18
## 59     220   20
## 60     223   21
## 61      41    0
## 62      49    2
## 63      59    4
## 64      74    6
## 65      97    8
## 66     124   10
## 67     141   12
## 68     148   14
## 69     155   16
## 70     160   18
## 71     160   20
## 72     157   21
## 73      41    0
## 74      49    2
## 75      57    4
## 76      71    6
## 77      89    8
## 78     112   10
## 79     146   12
## 80     174   14
## 81     218   16
## 82     250   18
## 83     288   20
## 84     305   21
## 85      42    0
## 86      50    2
## 87      61    4
## 88      71    6
## 89      84    8
## 90      93   10
## 91     110   12
## 92     116   14
## 93     126   16
## 94     134   18
## 95     125   20
## 96      42    0
## 97      51    2
## 98      59    4
## 99      68    6
## 100     85    8
## 101     96   10
## 102     90   12
## 103     92   14
## 104     93   16
## 105    100   18
## 106    100   20
## 107     98   21
## 108     41    0
## 109     44    2
## 110     52    4
## 111     63    6
## 112     74    8
## 113     81   10
## 114     89   12
## 115     96   14
## 116    101   16
## 117    112   18
## 118    120   20
## 119    124   21
## 120     43    0
## 121     51    2
## 122     63    4
## 123     84    6
## 124    112    8
## 125    139   10
## 126    168   12
## 127    177   14
## 128    182   16
## 129    184   18
## 130    181   20
## 131    175   21
## 132     41    0
## 133     49    2
## 134     56    4
## 135     62    6
## 136     72    8
## 137     88   10
## 138    119   12
## 139    135   14
## 140    162   16
## 141    185   18
## 142    195   20
## 143    205   21
## 144     41    0
## 145     48    2
## 146     53    4
## 147     60    6
## 148     65    8
## 149     67   10
## 150     71   12
## 151     70   14
## 152     71   16
## 153     81   18
## 154     91   20
## 155     96   21
## 156     41    0
## 157     49    2
## 158     62    4
## 159     79    6
## 160    101    8
## 161    128   10
## 162    164   12
## 163    192   14
## 164    227   16
## 165    248   18
## 166    259   20
## 167    266   21
## 168     41    0
## 169     49    2
## 170     56    4
## 171     64    6
## 172     68    8
## 173     68   10
## 174     67   12
## 175     68   14
## 176     41    0
## 177     45    2
## 178     49    4
## 179     51    6
## 180     57    8
## 181     51   10
## 182     54   12
## 183     42    0
## 184     51    2
## 185     61    4
## 186     72    6
## 187     83    8
## 188     89   10
## 189     98   12
## 190    103   14
## 191    113   16
## 192    123   18
## 193    133   20
## 194    142   21
## 195     39    0
## 196     35    2
## 197     43    0
## 198     48    2
## 199     55    4
## 200     62    6
## 201     65    8
## 202     71   10
## 203     82   12
## 204     88   14
## 205    106   16
## 206    120   18
## 207    144   20
## 208    157   21
## 209     41    0
## 210     47    2
## 211     54    4
## 212     58    6
## 213     65    8
## 214     73   10
## 215     77   12
## 216     89   14
## 217     98   16
## 218    107   18
## 219    115   20
## 220    117   21
## 221     40    0
## 222     50    2
## 223     62    4
## 224     86    6
## 225    125    8
## 226    163   10
## 227    217   12
## 228    240   14
## 229    275   16
## 230    307   18
## 231    318   20
## 232    331   21
## 233     41    0
## 234     55    2
## 235     64    4
## 236     77    6
## 237     90    8
## 238     95   10
## 239    108   12
## 240    111   14
## 241    131   16
## 242    148   18
## 243    164   20
## 244    167   21
## 245     43    0
## 246     52    2
## 247     61    4
## 248     73    6
## 249     90    8
## 250    103   10
## 251    127   12
## 252    135   14
## 253    145   16
## 254    163   18
## 255    170   20
## 256    175   21
## 257     42    0
## 258     52    2
## 259     58    4
## 260     74    6
## 261     66    8
## 262     68   10
## 263     70   12
## 264     71   14
## 265     72   16
## 266     72   18
## 267     76   20
## 268     74   21
## 269     40    0
## 270     49    2
## 271     62    4
## 272     78    6
## 273    102    8
## 274    124   10
## 275    146   12
## 276    164   14
## 277    197   16
## 278    231   18
## 279    259   20
## 280    265   21
## 281     42    0
## 282     48    2
## 283     57    4
## 284     74    6
## 285     93    8
## 286    114   10
## 287    136   12
## 288    147   14
## 289    169   16
## 290    205   18
## 291    236   20
## 292    251   21
## 293     39    0
## 294     46    2
## 295     58    4
## 296     73    6
## 297     87    8
## 298    100   10
## 299    115   12
## 300    123   14
## 301    144   16
## 302    163   18
## 303    185   20
## 304    192   21
## 305     39    0
## 306     46    2
## 307     58    4
## 308     73    6
## 309     92    8
## 310    114   10
## 311    145   12
## 312    156   14
## 313    184   16
## 314    207   18
## 315    212   20
## 316    233   21
## 317     39    0
## 318     48    2
## 319     59    4
## 320     74    6
## 321     87    8
## 322    106   10
## 323    134   12
## 324    150   14
## 325    187   16
## 326    230   18
## 327    279   20
## 328    309   21
## 329     42    0
## 330     48    2
## 331     59    4
## 332     72    6
## 333     85    8
## 334     98   10
## 335    115   12
## 336    122   14
## 337    143   16
## 338    151   18
## 339    157   20
## 340    150   21
## 341     42    0
## 342     53    2
## 343     62    4
## 344     73    6
## 345     85    8
## 346    102   10
## 347    123   12
## 348    138   14
## 349    170   16
## 350    204   18
## 351    235   20
## 352    256   21
## 353     41    0
## 354     49    2
## 355     65    4
## 356     82    6
## 357    107    8
## 358    129   10
## 359    159   12
## 360    179   14
## 361    221   16
## 362    263   18
## 363    291   20
## 364    305   21
## 365     39    0
## 366     50    2
## 367     63    4
## 368     77    6
## 369     96    8
## 370    111   10
## 371    137   12
## 372    144   14
## 373    151   16
## 374    146   18
## 375    156   20
## 376    147   21
## 377     41    0
## 378     49    2
## 379     63    4
## 380     85    6
## 381    107    8
## 382    134   10
## 383    164   12
## 384    186   14
## 385    235   16
## 386    294   18
## 387    327   20
## 388    341   21
## 389     41    0
## 390     53    2
## 391     64    4
## 392     87    6
## 393    123    8
## 394    158   10
## 395    201   12
## 396    238   14
## 397    287   16
## 398    332   18
## 399    361   20
## 400    373   21
## 401     39    0
## 402     48    2
## 403     61    4
## 404     76    6
## 405     98    8
## 406    116   10
## 407    145   12
## 408    166   14
## 409    198   16
## 410    227   18
## 411    225   20
## 412    220   21
## 413     41    0
## 414     48    2
## 415     56    4
## 416     68    6
## 417     80    8
## 418     83   10
## 419    103   12
## 420    112   14
## 421    135   16
## 422    157   18
## 423    169   20
## 424    178   21
## 425     41    0
## 426     49    2
## 427     61    4
## 428     74    6
## 429     98    8
## 430    109   10
## 431    128   12
## 432    154   14
## 433    192   16
## 434    232   18
## 435    280   20
## 436    290   21
## 437     42    0
## 438     50    2
## 439     61    4
## 440     78    6
## 441     89    8
## 442    109   10
## 443    130   12
## 444    146   14
## 445    170   16
## 446    214   18
## 447    250   20
## 448    272   21
## 449     41    0
## 450     55    2
## 451     66    4
## 452     79    6
## 453    101    8
## 454    120   10
## 455    154   12
## 456    182   14
## 457    215   16
## 458    262   18
## 459    295   20
## 460    321   21
## 461     42    0
## 462     51    2
## 463     66    4
## 464     85    6
## 465    103    8
## 466    124   10
## 467    155   12
## 468    153   14
## 469    175   16
## 470    184   18
## 471    199   20
## 472    204   21
## 473     42    0
## 474     49    2
## 475     63    4
## 476     84    6
## 477    103    8
## 478    126   10
## 479    160   12
## 480    174   14
## 481    204   16
## 482    234   18
## 483    269   20
## 484    281   21
## 485     42    0
## 486     55    2
## 487     69    4
## 488     96    6
## 489    131    8
## 490    157   10
## 491    184   12
## 492    188   14
## 493    197   16
## 494    198   18
## 495    199   20
## 496    200   21
## 497     42    0
## 498     51    2
## 499     65    4
## 500     86    6
## 501    103    8
## 502    118   10
## 503    127   12
## 504    138   14
## 505    145   16
## 506    146   18
## 507     41    0
## 508     50    2
## 509     61    4
## 510     78    6
## 511     98    8
## 512    117   10
## 513    135   12
## 514    141   14
## 515    147   16
## 516    174   18
## 517    197   20
## 518    196   21
## 519     40    0
## 520     52    2
## 521     62    4
## 522     82    6
## 523    101    8
## 524    120   10
## 525    144   12
## 526    156   14
## 527    173   16
## 528    210   18
## 529    231   20
## 530    238   21
## 531     41    0
## 532     53    2
## 533     66    4
## 534     79    6
## 535    100    8
## 536    123   10
## 537    148   12
## 538    157   14
## 539    168   16
## 540    185   18
## 541    210   20
## 542    205   21
## 543     39    0
## 544     50    2
## 545     62    4
## 546     80    6
## 547    104    8
## 548    125   10
## 549    154   12
## 550    170   14
## 551    222   16
## 552    261   18
## 553    303   20
## 554    322   21
## 555     40    0
## 556     53    2
## 557     64    4
## 558     85    6
## 559    108    8
## 560    128   10
## 561    152   12
## 562    166   14
## 563    184   16
## 564    203   18
## 565    233   20
## 566    237   21
## 567     41    0
## 568     54    2
## 569     67    4
## 570     84    6
## 571    105    8
## 572    122   10
## 573    155   12
## 574    175   14
## 575    205   16
## 576    234   18
## 577    264   20
## 578    264   21
# a more complex example of using sapply()
maxcor <- function(i, n = 10, m = 5) { # since sapply will always pass an argument to the applied function, a dummy argument (i) is added to the function.
  mat <- matrix(rnorm(n * m), n, m)
  corr <- cor(mat)
  diag(corr) <- NA
  max(corr, na.rm = TRUE)
}
maxcors <- sapply(1:1000, maxcor, n = 100)
mean(maxcors)
## [1] 0.1552898
# using apply()
sstate <- scale(state.x77, center = apply(state.x77, 2, median), scale = apply(state.x77, 2, mad))
sstate
##                  Population      Income  Illiteracy     Life Exp       Murder
## Alabama         0.268654565 -1.53997252  2.21618392 -1.053891812  1.589871076
## Alaska         -0.855785018  3.09026889  1.05991405 -0.885269122  0.857566823
## Arizona        -0.216757354  0.01892704  1.63804899 -0.081068601  0.183076063
## Arkansas       -0.252047457 -1.96324989  1.83076063 -0.009728232  0.626312848
## California      6.352045703  1.02378062  0.28906747  0.671248015  0.664855177
## Colorado       -0.102929470  0.62803349 -0.48177911  0.898240098 -0.009635582
## Connecticut     0.090474139  1.42641031  0.28906747  1.170630597 -0.722668671
## Delaware       -0.781744997  0.49898551 -0.09635582 -0.398857516 -0.125262570
## Florida         1.881619900  0.50930935  0.67449076 -0.009728232  0.741939835
## Georgia         0.723966101 -0.73643379  2.02347228 -1.384651703  1.358617101
## Hawaii         -0.681756369  0.76396402  1.83076063  1.897005261 -0.125262570
## Idaho          -0.700785347 -0.68825588 -0.67449076  0.775015825 -0.298703051
## Illinois        2.891885618  1.01173614 -0.09635582 -0.346973612  0.664855177
## Indiana         0.856131000 -0.10495902 -0.48177911  0.132952505  0.048177911
## Iowa            0.007784582  0.18754973 -0.86720241  1.222514502 -0.876837987
## Kansas         -0.193230618  0.25809595 -0.67449076  1.235485478 -0.452872367
## Kentucky        0.189770804 -1.38855623  1.25262570 -0.372915564  0.722668671
## Louisiana       0.334737015 -1.67590306  3.56516544 -1.241970966  1.223718949
## Maine          -0.616019901 -1.41952775 -0.48177911 -0.184836410 -0.799753329
## Maryland        0.444067140  1.34209896 -0.09635582 -0.295089707  0.317974215
## Massachusetts   1.029467686  0.40607097  0.28906747  0.749073872 -0.684126342
## Michigan        2.170168396  0.39918841 -0.09635582 -0.029184696  0.819024494
## Minnesota       0.374524877  0.26841979 -0.67449076  1.481934024 -0.876837987
## Mississippi    -0.172125752 -2.44502900  2.79431886 -1.676498667  1.088820797
## Missouri        0.667225150 -0.45596952 -0.28906747  0.009728232  0.472143532
## Montana        -0.723966101 -0.29595003 -0.67449076 -0.074583113 -0.356516544
## Nebraska       -0.447872936 -0.01892704 -0.67449076  1.248456454 -0.761211000
## Nevada         -0.777939201  1.08400301 -0.86720241 -1.066862788  0.896109152
## New Hampshire  -0.701131328 -0.40951225 -0.48177911  0.359944588 -0.684126342
## New Jersey      1.555013449  1.23541930  0.28906747  0.165379946 -0.317974215
## New Mexico     -0.586265500 -1.57954724  2.40889557 -0.230234827  0.549228190
## New York        5.271891740  0.66072564  0.86720241 -0.081068601  0.780482165
## North Carolina  0.900416620 -1.10809196  1.63804899 -0.950124003  0.819024494
## North Dakota   -0.761678075  0.97732335 -0.28906747  1.365195239 -1.050278468
## Ohio            2.732042207  0.07226687 -0.28906747  0.094039577  0.105991405
## Oklahoma       -0.042728704 -0.92226288  0.28906747  0.483168861 -0.086720241
## Oregon         -0.191846692  0.24261020 -0.67449076  0.943638514 -0.510685861
## Pennsylvania    3.121271293 -0.12044478  0.09635582 -0.158894458 -0.144533734
## Rhode Island   -0.659959540  0.06710495  0.67449076  0.794472289 -0.857566823
## South Carolina -0.007784582 -1.52104549  2.60160722 -1.760810012  0.915380316
## South Dakota   -0.746454893 -0.60566517 -0.86720241  0.911211074 -0.992464975
## Tennessee       0.461712192 -1.20100651  1.44533734 -0.366430076  0.799753329
## Texas           3.251706285 -0.56953174  2.40889557  0.145923482  1.031007304
## Utah           -0.565852597 -0.85515793 -0.67449076  1.443021096 -0.452872367
## Vermont        -0.818765008 -1.05303149 -0.67449076  0.625849599 -0.260160722
## Virginia        0.741265172  0.31315642  0.86720241 -0.385886540  0.510685861
## Washington      0.249279606  0.59362069 -0.67449076  0.677733504 -0.491414696
## West Virginia  -0.359647676 -1.55201700  0.86720241 -0.775015825 -0.028906747
## Wisconsin       0.605640459 -0.08775262 -0.48177911  1.170630597 -0.741939835
## Wyoming        -0.851979223  0.08087007 -0.67449076 -0.249691291  0.009635582
##                     HS Grad        Frost         Area
## Alabama        -1.389683548 -1.770538244 -0.101552765
## Alaska          1.564120813  0.702594541 14.572921383
## Arizona         0.564013825 -1.864217516  1.682776836
## Arkansas       -1.552491662 -0.927424794 -0.066355015
## California      1.087325621 -1.770538244  2.904710696
## Colorado        1.238504584  0.964896503  1.408166095
## Connecticut     0.319801653  0.459028434 -1.406060490
## Delaware        0.156993539 -0.215462326 -1.488008364
## Florida        -0.075589482 -1.939160933 -0.005320921
## Georgia        -1.471087605 -1.021104066  0.108011851
## Hawaii          1.005921564 -2.145255332 -1.361586695
## Idaho           0.726821939  0.215462326  0.808097094
## Illinois       -0.075589482  0.234198180  0.041856015
## Indiana        -0.040702029  0.140518908 -0.517295957
## Iowa            0.668676184  0.477764288  0.047347661
## Kansas          0.773338543 -0.009367927  0.782772925
## Kentucky       -1.715299776 -0.365349161 -0.416198458
## Louisiana      -1.285021188 -1.920425079 -0.265960688
## Maine           0.168622690  0.871217231 -0.664602952
## Maryland       -0.110476935 -0.252934035 -1.262964705
## Massachusetts   0.610530429 -0.215462326 -1.321722469
## Michigan       -0.052331180  0.196726472  0.072273473
## Minnesota       0.505868070  0.852481377  0.711694525
## Mississippi    -1.424571001 -1.208462611 -0.198638233
## Missouri       -0.517497221 -0.121783054  0.418787783
## Montana         0.691934486  0.758802104  2.598145974
## Nebraska        0.703563637  0.459028434  0.631852256
## Nevada          1.389683548  1.377085301  1.582390690
## New Hampshire   0.505868070  1.114783339 -1.287549067
## New Jersey     -0.087218633  0.009367927 -1.330400977
## New Mexico      0.226768445  0.103047199  1.910267550
## New York       -0.063960331 -0.608915269 -0.183415277
## North Carolina -1.715299776 -0.646386978 -0.155900140
## North Dakota   -0.343059955  1.339613592  0.426698029
## Ohio           -0.005814576  0.177990617 -0.378496745
## Oklahoma       -0.191880992 -0.608915269  0.412727055
## Oregon          0.784967694 -1.320877737  1.192426934
## Pennsylvania   -0.354689106  0.215462326 -0.264936340
## Rhode Island   -0.796596845  0.234198180 -1.514556061
## South Carolina -1.796703833 -0.927424794 -0.684378567
## South Dakota    0.005814576  1.077311630  0.616828479
## Tennessee      -1.331537792 -0.833745522 -0.368452439
## Texas          -0.680305335 -1.489500427  5.914388652
## Utah            1.633895719  0.421556725  0.791565249
## Vermont         0.447722314  1.002368212 -1.280720078
## Virginia       -0.633788731 -0.552707706 -0.412499422
## Washington      1.191987980 -1.545707990  0.349786534
## West Virginia  -1.354796094 -0.271669889 -0.859513695
## Wisconsin       0.145364388  0.646386978  0.005320921
## Wyoming         1.122213074  1.096047484  1.221421685
## attr(,"scaled:center")
## Population     Income Illiteracy   Life Exp     Murder    HS Grad      Frost 
##   2838.500   4519.000      0.950     70.675      6.850     53.250    114.500 
##       Area 
##  54277.000 
## attr(,"scaled:scale")
##   Population       Income   Illiteracy     Life Exp       Murder      HS Grad 
##  2890.328700   581.179200     0.518910     1.541904     5.189100     8.599080 
##        Frost         Area 
##    53.373600 35144.291700
# another example
summfun <- function(x) c(n = sum(!is.na(x)), mean = mean(x), sd = sd(x))
x <- apply(state.x77, 2, summfun)
t(x)
##             n       mean            sd
## Population 50  4246.4200  4464.4914334
## Income     50  4435.8000   614.4699392
## Illiteracy 50     1.1700     0.6095331
## Life Exp   50    70.8786     1.3423936
## Murder     50     7.3780     3.6915397
## HS Grad    50    53.1080     8.0769978
## Frost      50   104.4600    51.9808481
## Area       50 70735.8800 85327.2996224
# apply() for non-overlapping groups
x <- 1:12
apply(matrix(x, ncol = 3, byrow = TRUE), 1, sum) # first need to be converted into a matrix
## [1]  6 15 24 33
# specific statistics calculation without apply()
mns <- colMeans(USJudgeRatings)
mns
##     CONT     INTG     DMNR     DILG     CFMG     DECI     PREP     FAMI 
## 7.437209 8.020930 7.516279 7.693023 7.479070 7.565116 7.467442 7.488372 
##     ORAL     WRIT     PHYS     RTEN 
## 7.293023 7.383721 7.934884 7.602326
jscore <- rowSums(USJudgeRatings >= 8)
head(jscore)
##  AARONSON,L.H. ALEXANDER,J.M. ARMENTANO,A.J.    BERDON,R.I.   BRACKEN,J.J. 
##              1              8              1             11              0 
##     BURNS,E.B. 
##             10
# using sweep()
maxes <- apply(state.x77, 2, max)
swept <- sweep(state.x77, 2, maxes, "/")
head(swept)
##            Population    Income Illiteracy  Life Exp    Murder   HS Grad
## Alabama    0.17053496 0.5738717  0.7500000 0.9381793 1.0000000 0.6136701
## Alaska     0.01721861 1.0000000  0.5357143 0.9417120 0.7483444 0.9910847
## Arizona    0.10434947 0.7173397  0.6428571 0.9585598 0.5165563 0.8632987
## Arkansas   0.09953769 0.5349169  0.6785714 0.9600543 0.6688742 0.5928678
## California 1.00000000 0.8098179  0.3928571 0.9743207 0.6821192 0.9301634
## Colorado   0.11986980 0.7733967  0.2500000 0.9790761 0.4503311 0.9494799
##                 Frost       Area
## Alabama    0.10638298 0.08952178
## Alaska     0.80851064 1.00000000
## Arizona    0.07978723 0.20023057
## Arkansas   0.34574468 0.09170562
## California 0.10638298 0.27604549
## Colorado   0.88297872 0.18319233
# non-working sweep() example
# right way to proceed
meds <- apply(state.x77, 2, median)
meanmed <- function(var, med) mean(var[var > med])
meanmed(state.x77[, 1], meds[1]) # for every col
## [1] 7136.16
meanmed(state.x77[, 2], meds[2])
## [1] 4917.92
# opposite to using sweep()  -> doesn´t work! returns only a single value
sweep(state.x77, 2, meds, meanmed)
## [1] 15569.75
# solution via mapply()
mapply(meanmed, as.data.frame(state.x77), meds)
## Population     Income Illiteracy   Life Exp     Murder    HS Grad      Frost 
##   7136.160   4917.920      1.660     71.950     10.544     59.524    146.840 
##       Area 
## 112213.400
# mapping a function based on groups
aggregate(iris[-5], iris[5], mean)
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026
weights <- aggregate(ChickWeight$weight, ChickWeight[c("Time", "Diet")], mean)
# or
# weights <- aggregate(ChickWeight$weight, list(Time = ChickWeight$Time, Diet = ChickWeight$Diet), mean)
weights
##    Time Diet         x
## 1     0    1  41.40000
## 2     2    1  47.25000
## 3     4    1  56.47368
## 4     6    1  66.78947
## 5     8    1  79.68421
## 6    10    1  93.05263
## 7    12    1 108.52632
## 8    14    1 123.38889
## 9    16    1 144.64706
## 10   18    1 158.94118
## 11   20    1 170.41176
## 12   21    1 177.75000
## 13    0    2  40.70000
## 14    2    2  49.40000
## 15    4    2  59.80000
## 16    6    2  75.40000
## 17    8    2  91.70000
## 18   10    2 108.50000
## 19   12    2 131.30000
## 20   14    2 141.90000
## 21   16    2 164.70000
## 22   18    2 187.70000
## 23   20    2 205.60000
## 24   21    2 214.70000
## 25    0    3  40.80000
## 26    2    3  50.40000
## 27    4    3  62.20000
## 28    6    3  77.90000
## 29    8    3  98.40000
## 30   10    3 117.10000
## 31   12    3 144.40000
## 32   14    3 164.50000
## 33   16    3 197.40000
## 34   18    3 233.10000
## 35   20    3 258.90000
## 36   21    3 270.30000
## 37    0    4  41.00000
## 38    2    4  51.80000
## 39    4    4  64.50000
## 40    6    4  83.90000
## 41    8    4 105.60000
## 42   10    4 126.00000
## 43   12    4 151.40000
## 44   14    4 161.80000
## 45   16    4 182.00000
## 46   18    4 202.90000
## 47   20    4 233.88889
## 48   21    4 238.55556
# a single vector based on the values or one or more grouping vectors using tapply()
maxweight <- tapply(PlantGrowth$weight, PlantGrowth$group, max)
as.data.frame(as.table(maxweight), responseName = "MaxWeight")
##   Var1 MaxWeight
## 1 ctrl      6.11
## 2 trt1      6.03
## 3 trt2      6.31
ranges <- tapply(PlantGrowth$weight, PlantGrowth$group, range)
ranges
## $ctrl
## [1] 4.17 6.11
## 
## $trt1
## [1] 3.59 6.03
## 
## $trt2
## [1] 4.92 6.31
# converting the results to a data frame
data.frame(group = dimnames(ranges)[[1]], matrix(unlist(ranges), ncol = 2, byrow = TRUE))
##   group   X1   X2
## 1  ctrl 4.17 6.11
## 2  trt1 3.59 6.03
## 3  trt2 4.92 6.31
# using more than one grouping variable with tapply() and the returned value is NOT a scalar
ranges1 <- tapply(CO2$uptake, CO2[c("Type", "Treatment")], range)
data.frame(expand.grid(dimnames(ranges1)), matrix(unlist(ranges1), byrow = TRUE, ncol = 2))
##          Type  Treatment   X1   X2
## 1      Quebec nonchilled 13.6 45.5
## 2 Mississippi nonchilled 10.6 35.5
## 3      Quebec    chilled  9.3 42.4
## 4 Mississippi    chilled  7.7 22.2
# using by()
sumfun <- function(x) data.frame(n = length(x$uptake), mean = mean(x$uptake), sd = sd(x$uptake))
bb <- by(CO2, CO2[c("Type", "Treatment")], sumfun)
bb
## Type: Quebec
## Treatment: nonchilled
##    n     mean       sd
## 1 21 35.33333 9.596371
## ------------------------------------------------------------ 
## Type: Mississippi
## Treatment: nonchilled
##    n     mean       sd
## 1 21 25.95238 7.402136
## ------------------------------------------------------------ 
## Type: Quebec
## Treatment: chilled
##    n     mean       sd
## 1 21 31.75238 9.644823
## ------------------------------------------------------------ 
## Type: Mississippi
## Treatment: chilled
##    n     mean       sd
## 1 21 15.81429 4.058976
cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
##          Type  Treatment  n     mean       sd
## 1      Quebec nonchilled 21 35.33333 9.596371
## 2 Mississippi nonchilled 21 25.95238 7.402136
## 3      Quebec    chilled 21 31.75238 9.644823
## 4 Mississippi    chilled 21 15.81429 4.058976
states <- data.frame(state.x77, state = row.names(state.x77), region = state.region, row.names = 1:50)
head(states)
##   Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area      state
## 1       3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
## 2        365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
## 3       2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
## 4       2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
## 5      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
## 6       2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
##   region
## 1  South
## 2   West
## 3   West
## 4  South
## 5   West
## 6   West
mstates <- reshape::melt(states)
head(mstates)
##        state region   variable value
## 1    Alabama  South Population  3615
## 2     Alaska   West Population   365
## 3    Arizona   West Population  2212
## 4   Arkansas  South Population  2110
## 5 California   West Population 21198
## 6   Colorado   West Population  2541
reshape::cast(mstates, region ~ variable, mean)
##          region Population   Income Illiteracy Life.Exp    Murder  HS.Grad
## 1     Northeast   5495.111 4570.222   1.000000 71.26444  4.722222 53.96667
## 2         South   4208.125 4011.938   1.737500 69.70625 10.581250 44.34375
## 3 North Central   4803.000 4611.083   0.700000 71.76667  5.275000 54.51667
## 4          West   2915.308 4702.615   1.023077 71.23462  7.215385 62.00000
##      Frost      Area
## 1 132.7778  18141.00
## 2  64.6250  54605.12
## 3 138.8333  62652.00
## 4 102.1538 134463.00
reshape::cast(mstates, variable ~ region, mean)
##     variable    Northeast       South North Central          West
## 1 Population  5495.111111  4208.12500    4803.00000   2915.307692
## 2     Income  4570.222222  4011.93750    4611.08333   4702.615385
## 3 Illiteracy     1.000000     1.73750       0.70000      1.023077
## 4   Life.Exp    71.264444    69.70625      71.76667     71.234615
## 5     Murder     4.722222    10.58125       5.27500      7.215385
## 6    HS.Grad    53.966667    44.34375      54.51667     62.000000
## 7      Frost   132.777778    64.62500     138.83333    102.153846
## 8       Area 18141.000000 54605.12500   62652.00000 134463.000000
reshape::cast(mstates, region ~ variable, mean, subset = variable %in% c("Population", "Life.Exp"))
##          region Population Life.Exp
## 1     Northeast   5495.111 71.26444
## 2         South   4208.125 69.70625
## 3 North Central   4803.000 71.76667
## 4          West   2915.308 71.23462
reshape::cast(mstates, . ~ variable, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp"))
##   value Population_mean Population_median Population_sd Life.Exp_mean
## 1 (all)         4246.42            2838.5      4464.491       70.8786
##   Life.Exp_median Life.Exp_sd
## 1          70.675    1.342394
# or
# reshape::cast(mstates, variable ~ ., c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp"))
# using a grouping variable
reshape::cast(mstates, region ~ variable, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp")) # data frame
##          region Population_mean Population_median Population_sd Life.Exp_mean
## 1     Northeast        5495.111            3100.0      6079.565      71.26444
## 2         South        4208.125            3710.5      2779.508      69.70625
## 3 North Central        4803.000            4255.0      3702.828      71.76667
## 4          West        2915.308            1144.0      5578.607      71.23462
##   Life.Exp_median Life.Exp_sd
## 1           71.23   0.7438769
## 2           70.07   1.0221994
## 3           72.28   1.0367285
## 4           71.71   1.3519715
reshape::cast(mstates, variable ~ . | region, c(mean, median, sd), subset = variable %in% c("Population", "Life.Exp")) # list
## $Northeast
##     variable       mean  median           sd
## 1 Population 5495.11111 3100.00 6079.5651457
## 2   Life.Exp   71.26444   71.23    0.7438769
## 
## $South
##     variable       mean  median          sd
## 1 Population 4208.12500 3710.50 2779.508251
## 2   Life.Exp   69.70625   70.07    1.022199
## 
## $`North Central`
##     variable       mean  median          sd
## 1 Population 4803.00000 4255.00 3702.827593
## 2   Life.Exp   71.76667   72.28    1.036729
## 
## $West
##     variable       mean  median          sd
## 1 Population 2915.30769 1144.00 5578.607015
## 2   Life.Exp   71.23462   71.71    1.351971
# another example
mChick <- reshape::melt(ChickWeight, measure.var = "weight")
head(reshape::cast(mChick, Diet + Time ~ variable, median))
##   Diet Time weight
## 1    1    0     41
## 2    1    2     49
## 3    1    4     56
## 4    1    6     67
## 5    1    8     79
## 6    1   10     93
reshape::cast(mChick, Diet ~ Time + variable, mean)
##   Diet 0_weight 2_weight 4_weight 6_weight  8_weight 10_weight 12_weight
## 1    1     41.4    47.25 56.47368 66.78947  79.68421  93.05263  108.5263
## 2    2     40.7    49.40 59.80000 75.40000  91.70000 108.50000  131.3000
## 3    3     40.8    50.40 62.20000 77.90000  98.40000 117.10000  144.4000
## 4    4     41.0    51.80 64.50000 83.90000 105.60000 126.00000  151.4000
##   14_weight 16_weight 18_weight 20_weight 21_weight
## 1  123.3889  144.6471  158.9412  170.4118  177.7500
## 2  141.9000  164.7000  187.7000  205.6000  214.7000
## 3  164.5000  197.4000  233.1000  258.9000  270.3000
## 4  161.8000  182.0000  202.9000  233.8889  238.5556
reshape::cast(mChick, Time ~ variable | Diet, mean)
## $`1`
##    Time    weight
## 1     0  41.40000
## 2     2  47.25000
## 3     4  56.47368
## 4     6  66.78947
## 5     8  79.68421
## 6    10  93.05263
## 7    12 108.52632
## 8    14 123.38889
## 9    16 144.64706
## 10   18 158.94118
## 11   20 170.41176
## 12   21 177.75000
## 
## $`2`
##    Time weight
## 1     0   40.7
## 2     2   49.4
## 3     4   59.8
## 4     6   75.4
## 5     8   91.7
## 6    10  108.5
## 7    12  131.3
## 8    14  141.9
## 9    16  164.7
## 10   18  187.7
## 11   20  205.6
## 12   21  214.7
## 
## $`3`
##    Time weight
## 1     0   40.8
## 2     2   50.4
## 3     4   62.2
## 4     6   77.9
## 5     8   98.4
## 6    10  117.1
## 7    12  144.4
## 8    14  164.5
## 9    16  197.4
## 10   18  233.1
## 11   20  258.9
## 12   21  270.3
## 
## $`4`
##    Time   weight
## 1     0  41.0000
## 2     2  51.8000
## 3     4  64.5000
## 4     6  83.9000
## 5     8 105.6000
## 6    10 126.0000
## 7    12 151.4000
## 8    14 161.8000
## 9    16 182.0000
## 10   18 202.9000
## 11   20 233.8889
## 12   21 238.5556
# including all possible combinations
xChickWeight <- subset(ChickWeight, !(Diet == 1 & Time == 4))
mxChick <- reshape::melt(xChickWeight, measure.var = "weight")
head(reshape::cast(mxChick, Diet + Time ~ variable, median))
##   Diet Time weight
## 1    1    0     41
## 2    1    2     49
## 3    1    6     67
## 4    1    8     79
## 5    1   10     93
## 6    1   12    106
head(reshape::cast(mxChick, Diet + Time ~ variable, median, add.missing = TRUE))
##   Diet Time weight
## 1    1    0     41
## 2    1    2     49
## 3    1    4     NA
## 4    1    6     67
## 5    1    8     79
## 6    1   10     93
# using recast()
head(reshape::recast(xChickWeight, measure.var = "weight", Diet + Time ~ variable, median, add.missing = TRUE))
##   Diet Time weight
## 1    1    0     41
## 2    1    2     49
## 3    1    4     NA
## 4    1    6     67
## 5    1    8     79
## 6    1   10     93

10.2 Tidyverse

group_by() is a very useful verb; as the name implies, it allows you to create groups and then, for example, compute descriptive statistics by groups. Once your data is grouped, the operations that will follow will be executed inside each group.

Counting is one of the most common tasks you do when working with data. Counting may sound simple, but it can get complicated quickly. Intuitively, one would think that the count function counts the values of discrete variables: The number of players on a team, the number of cars, etc. However, count can also be used to calculate the sum of a variable for a particular group or groups. count creates a new data frame with the grouping variable and the frequency or sum variable. This is not always what you want. Sometimes you want to add counts to your existing data frame.

The argument wt stands for weighted counts. While count calculates the frequency of values within a group without specifying the wt argument (n = n()), wt calculates the sum of a continuous variable for certain groups (n = sum(<VARIABLE>)). This technique has its advantages and disadvantages. On the positive side, we only need three lines of code instead of six. On the downside, the code is less explicit, and without knowing the inner workings of count, it’s hard to tell that the function is calculating sums.

add_tally() does something similar than add_count(). The only difference is that add_tally calculates the sum of a given variable instead of a count. add_tally has no argument for grouping the data. You must accomplish this with group_by.

One of the credos of programming is “Don’t repeat yourself”. The tidyverse team has developed a set of functions that make it easier not to repeat ourselves: the across function that follows this general structure:

%>% ( across( .cols = , .fns = , .names = ) )

A couple of things are important here:

  • The function across only works inside dplyr verbs (e.g. mutate).
  • The function has three important arguments: .cols stands for the column to apply a function to. You can use the tidyselect functions here; .fns stands for the function(s) that will be applied to these columns; .names is used whenever you want to change the names of the selected columns.
  • The .fns argument takes three different values: (1) A simple function (e.g. mean). (2) A purrr-style lambda function (e.g. ~ mean(.x, na.rm = TRUE)). You use these lamba functions if you need to change the arguments of a function. (3) A list of functions (e.g. list(mean = mean, sd = sd)). The list can also be combined with lambda functions (e.g. list(mean = mean(.x, na.rm = TRUE), sd = sd(.x, na.rm = TRUE))).

You can be quite creative with across and summmarise, because you can calculate any summary statistic for many columns. The mean, standard deviation or median are just obvious examples. Making the problem smaller and easier to digest, you might simply ask yourself, “What summary statistics can I compute from a vector?”

# loading the data
data(Gasoline, package = "plm")
gasoline <- as_tibble(Gasoline)
gasoline <- gasoline %>%
  mutate(country = tolower(country))

# group_by()
gasoline %>%
  group_by(country)
## # A tibble: 342 × 6
## # Groups:   country [18]
##    country  year lgaspcar lincomep  lrpmg lcarpcap
##    <chr>   <int>    <dbl>    <dbl>  <dbl>    <dbl>
##  1 austria  1960     4.17    -6.47 -0.335    -9.77
##  2 austria  1961     4.10    -6.43 -0.351    -9.61
##  3 austria  1962     4.07    -6.41 -0.380    -9.46
##  4 austria  1963     4.06    -6.37 -0.414    -9.34
##  5 austria  1964     4.04    -6.32 -0.445    -9.24
##  6 austria  1965     4.03    -6.29 -0.497    -9.12
##  7 austria  1966     4.05    -6.25 -0.467    -9.02
##  8 austria  1967     4.05    -6.23 -0.506    -8.93
##  9 austria  1968     4.05    -6.21 -0.522    -8.85
## 10 austria  1969     4.05    -6.15 -0.559    -8.79
## # … with 332 more rows
gasoline %>%
  group_by(country, year)
## # A tibble: 342 × 6
## # Groups:   country, year [342]
##    country  year lgaspcar lincomep  lrpmg lcarpcap
##    <chr>   <int>    <dbl>    <dbl>  <dbl>    <dbl>
##  1 austria  1960     4.17    -6.47 -0.335    -9.77
##  2 austria  1961     4.10    -6.43 -0.351    -9.61
##  3 austria  1962     4.07    -6.41 -0.380    -9.46
##  4 austria  1963     4.06    -6.37 -0.414    -9.34
##  5 austria  1964     4.04    -6.32 -0.445    -9.24
##  6 austria  1965     4.03    -6.29 -0.497    -9.12
##  7 austria  1966     4.05    -6.25 -0.467    -9.02
##  8 austria  1967     4.05    -6.23 -0.506    -8.93
##  9 austria  1968     4.05    -6.21 -0.522    -8.85
## 10 austria  1969     4.05    -6.15 -0.559    -8.79
## # … with 332 more rows
gasoline %>%
  group_by(country, year) %>% 
  ungroup()
## # A tibble: 342 × 6
##    country  year lgaspcar lincomep  lrpmg lcarpcap
##    <chr>   <int>    <dbl>    <dbl>  <dbl>    <dbl>
##  1 austria  1960     4.17    -6.47 -0.335    -9.77
##  2 austria  1961     4.10    -6.43 -0.351    -9.61
##  3 austria  1962     4.07    -6.41 -0.380    -9.46
##  4 austria  1963     4.06    -6.37 -0.414    -9.34
##  5 austria  1964     4.04    -6.32 -0.445    -9.24
##  6 austria  1965     4.03    -6.29 -0.497    -9.12
##  7 austria  1966     4.05    -6.25 -0.467    -9.02
##  8 austria  1967     4.05    -6.23 -0.506    -8.93
##  9 austria  1968     4.05    -6.21 -0.522    -8.85
## 10 austria  1969     4.05    -6.15 -0.559    -8.79
## # … with 332 more rows
# getting summary statistics
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(mean_gaspcar = mean(lgaspcar))
## # A tibble: 18 × 2
##    country  mean_gaspcar
##    <chr>           <dbl>
##  1 austria          4.06
##  2 belgium          3.92
##  3 canada           4.86
##  4 denmark          4.19
##  5 france           3.82
##  6 germany          3.89
##  7 greece           4.88
##  8 ireland          4.23
##  9 italy            3.73
## 10 japan            4.70
## 11 netherla         4.08
## 12 norway           4.11
## 13 spain            4.06
## 14 sweden           4.01
## 15 switzerl         4.24
## 16 turkey           5.77
## 17 u.k.             3.98
## 18 u.s.a.           4.82
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(mean_gaspcar = mean(lgaspcar)) %>%
  filter(country == "france")
## # A tibble: 1 × 2
##   country mean_gaspcar
##   <chr>          <dbl>
## 1 france          3.82
desc_gasoline <- gasoline %>%
  group_by(country) %>%
  dplyr::summarise(
    mean_gaspcar = mean(lgaspcar),
    sd_gaspcar = sd(lgaspcar),
    max_gaspcar = max(lgaspcar),
    min_gaspcar = min(lgaspcar)
  )

desc_gasoline %>%
  filter(max(mean_gaspcar) == mean_gaspcar)
## # A tibble: 1 × 5
##   country mean_gaspcar sd_gaspcar max_gaspcar min_gaspcar
##   <chr>          <dbl>      <dbl>       <dbl>       <dbl>
## 1 turkey          5.77      0.329        6.16        5.14
desc_gasoline %>%
  filter(min(mean_gaspcar) == mean_gaspcar)
## # A tibble: 1 × 5
##   country mean_gaspcar sd_gaspcar max_gaspcar min_gaspcar
##   <chr>          <dbl>      <dbl>       <dbl>       <dbl>
## 1 italy           3.73      0.220        4.05        3.38
# group_by() and across()
gasoline <- gasoline %>%
  mutate(
    year = as.character(year),
    country = as.character(country)
  )

gasoline %>%
  group_by(across(is.character)) %>%
  dplyr::summarise(mean(lincomep))
## # A tibble: 342 × 3
## # Groups:   country [18]
##    country year  `mean(lincomep)`
##    <chr>   <chr>            <dbl>
##  1 austria 1960             -6.47
##  2 austria 1961             -6.43
##  3 austria 1962             -6.41
##  4 austria 1963             -6.37
##  5 austria 1964             -6.32
##  6 austria 1965             -6.29
##  7 austria 1966             -6.25
##  8 austria 1967             -6.23
##  9 austria 1968             -6.21
## 10 austria 1969             -6.15
## # … with 332 more rows
gasoline %>%
  group_by(across(c(1, 2))) %>%
  dplyr::summarise(mean(lincomep))
## # A tibble: 342 × 3
## # Groups:   country [18]
##    country year  `mean(lincomep)`
##    <chr>   <chr>            <dbl>
##  1 austria 1960             -6.47
##  2 austria 1961             -6.43
##  3 austria 1962             -6.41
##  4 austria 1963             -6.37
##  5 austria 1964             -6.32
##  6 austria 1965             -6.29
##  7 austria 1966             -6.25
##  8 austria 1967             -6.23
##  9 austria 1968             -6.21
## 10 austria 1969             -6.15
## # … with 332 more rows
gasoline %>%
  group_by(across(seq(1:2))) %>%
  dplyr::summarise(mean(lincomep))
## # A tibble: 342 × 3
## # Groups:   country [18]
##    country year  `mean(lincomep)`
##    <chr>   <chr>            <dbl>
##  1 austria 1960             -6.47
##  2 austria 1961             -6.43
##  3 austria 1962             -6.41
##  4 austria 1963             -6.37
##  5 austria 1964             -6.32
##  6 austria 1965             -6.29
##  7 austria 1966             -6.25
##  8 austria 1967             -6.23
##  9 austria 1968             -6.21
## 10 austria 1969             -6.15
## # … with 332 more rows
# another across() example
mpg %>%
  group_by(manufacturer) %>%
  dplyr::summarise(
    across(
      .cols = c("displ", "cty"),
      .fns = ~ mean(.x, na.rm = TRUE),
      .names = "mean_{.col}"
    )
  )
## # A tibble: 15 × 3
##    manufacturer mean_displ mean_cty
##    <chr>             <dbl>    <dbl>
##  1 audi               2.54     17.6
##  2 chevrolet          5.06     15  
##  3 dodge              4.38     13.1
##  4 ford               4.54     14  
##  5 honda              1.71     24.4
##  6 hyundai            2.43     18.6
##  7 jeep               4.58     13.5
##  8 land rover         4.3      11.5
##  9 lincoln            5.4      11.3
## 10 mercury            4.4      13.2
## 11 nissan             3.27     18.1
## 12 pontiac            3.96     17  
## 13 subaru             2.46     19.3
## 14 toyota             2.95     18.5
## 15 volkswagen         2.26     20.9
# same example as before with changed col names and a func list()
mpg %>%
  group_by(manufacturer) %>%
  dplyr::summarise(
    across(
      .cols = c("displ", "cty"),
      .fns = list(mean = mean, sd = sd),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 15 × 5
##    manufacturer mean_displ sd_displ mean_cty sd_cty
##    <chr>             <dbl>    <dbl>    <dbl>  <dbl>
##  1 audi               2.54    0.673     17.6  1.97 
##  2 chevrolet          5.06    1.37      15    2.92 
##  3 dodge              4.38    0.868     13.1  2.49 
##  4 ford               4.54    0.541     14    1.91 
##  5 honda              1.71    0.145     24.4  1.94 
##  6 hyundai            2.43    0.365     18.6  1.50 
##  7 jeep               4.58    1.02      13.5  2.51 
##  8 land rover         4.3     0.258     11.5  0.577
##  9 lincoln            5.4     0         11.3  0.577
## 10 mercury            4.4     0.490     13.2  0.5  
## 11 nissan             3.27    0.864     18.1  3.43 
## 12 pontiac            3.96    0.808     17    1    
## 13 subaru             2.46    0.109     19.3  0.914
## 14 toyota             2.95    0.931     18.5  4.05 
## 15 volkswagen         2.26    0.443     20.9  4.56
# another example with 3 funcs
mpg %>%
  group_by(manufacturer) %>%
  dplyr::summarise(
    across(
      .cols = where(is.numeric),
      .fns = list(
        mean = mean, sd = sd,
        median = median
      ),
      .names = "{.fn}_{.col}"
    )
  ) %>%
  glimpse()
## Rows: 15
## Columns: 16
## $ manufacturer <chr> "audi", "chevrolet", "dodge", "ford", "honda", "hyundai",…
## $ mean_displ   <dbl> 2.544444, 5.063158, 4.378378, 4.536000, 1.711111, 2.42857…
## $ sd_displ     <dbl> 0.6732032, 1.3704057, 0.8679910, 0.5407402, 0.1452966, 0.…
## $ median_displ <dbl> 2.8, 5.3, 4.7, 4.6, 1.6, 2.4, 4.7, 4.3, 5.4, 4.3, 3.3, 3.…
## $ mean_year    <dbl> 2003.500, 2004.684, 2004.108, 2002.600, 2003.000, 2004.14…
## $ sd_year      <dbl> 4.630462, 4.460352, 4.520225, 4.500000, 4.743416, 4.62197…
## $ median_year  <dbl> 2003.5, 2008.0, 2008.0, 1999.0, 1999.0, 2008.0, 2008.0, 2…
## $ mean_cyl     <dbl> 5.222222, 7.263158, 7.081081, 7.200000, 4.000000, 4.85714…
## $ sd_cyl       <dbl> 1.2153700, 1.3679711, 1.1150082, 1.0000000, 0.0000000, 1.…
## $ median_cyl   <dbl> 6, 8, 8, 8, 4, 4, 8, 8, 8, 7, 6, 6, 4, 4, 4
## $ mean_cty     <dbl> 17.61111, 15.00000, 13.13514, 14.00000, 24.44444, 18.6428…
## $ sd_cty       <dbl> 1.9745108, 2.9249881, 2.4850907, 1.9148542, 1.9436506, 1.…
## $ median_cty   <dbl> 17.5, 15.0, 13.0, 14.0, 24.0, 18.5, 14.0, 11.5, 11.0, 13.…
## $ mean_hwy     <dbl> 26.44444, 21.89474, 17.94595, 19.36000, 32.55556, 26.8571…
## $ sd_hwy       <dbl> 2.175322, 5.108759, 3.574182, 3.327662, 2.554952, 2.17881…
## $ median_hwy   <dbl> 26.0, 23.0, 17.0, 18.0, 32.0, 26.5, 18.5, 16.5, 17.0, 18.…
# how to calculate the number of distinct values across many columns
mpg %>%
  dplyr::summarise(
    across(
      .cols = everything(),
      .fns = n_distinct
    )
  )
## # A tibble: 1 × 11
##   manufacturer model displ  year   cyl trans   drv   cty   hwy    fl class
##          <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1           15    38    35     2     4    10     3    21    27     5     7
# how to count with continuous variables
starwars %>%
  count(
    decade = 10 * (birth_year %/% 10),
    name = "characters_per_decade"
  ) %>%
  glimpse()
## Rows: 16
## Columns: 2
## $ decade                <dbl> 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110,…
## $ characters_per_decade <int> 1, 3, 4, 4, 9, 6, 4, 2, 2, 3, 1, 1, 1, 1, 1, 44
# summarise() across many columns
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(starts_with("l"), mean))
## # A tibble: 18 × 5
##    country  lgaspcar lincomep   lrpmg lcarpcap
##    <chr>       <dbl>    <dbl>   <dbl>    <dbl>
##  1 austria      4.06    -6.12 -0.486     -8.85
##  2 belgium      3.92    -5.85 -0.326     -8.63
##  3 canada       4.86    -5.58 -1.05      -8.08
##  4 denmark      4.19    -5.76 -0.358     -8.58
##  5 france       3.82    -5.87 -0.253     -8.45
##  6 germany      3.89    -5.85 -0.517     -8.51
##  7 greece       4.88    -6.61 -0.0339   -10.8 
##  8 ireland      4.23    -6.44 -0.348     -9.04
##  9 italy        3.73    -6.35 -0.152     -8.83
## 10 japan        4.70    -6.25 -0.287     -9.95
## 11 netherla     4.08    -5.92 -0.370     -8.82
## 12 norway       4.11    -5.75 -0.278     -8.77
## 13 spain        4.06    -5.63  0.739     -9.90
## 14 sweden       4.01    -7.82 -2.71      -8.25
## 15 switzerl     4.24    -5.93 -0.902     -8.54
## 16 turkey       5.77    -7.34 -0.422    -12.5 
## 17 u.k.         3.98    -6.02 -0.459     -8.55
## 18 u.s.a.       4.82    -5.45 -1.21      -7.78
# applying several functions to many columns at once
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(starts_with("l"), tibble::lst(mean, sd, max, min), .names = "{fn}_{col}"))
## # A tibble: 18 × 17
##    country  mean_lgasp…¹ sd_lg…² max_l…³ min_l…⁴ mean_…⁵ sd_li…⁶ max_l…⁷ min_l…⁸
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 austria          4.06  0.0693    4.20    3.92   -6.12   0.235   -5.76   -6.47
##  2 belgium          3.92  0.103     4.16    3.82   -5.85   0.227   -5.53   -6.22
##  3 canada           4.86  0.0262    4.90    4.81   -5.58   0.193   -5.31   -5.89
##  4 denmark          4.19  0.158     4.50    4.00   -5.76   0.176   -5.48   -6.06
##  5 france           3.82  0.0499    3.91    3.75   -5.87   0.241   -5.53   -6.26
##  6 germany          3.89  0.0239    3.93    3.85   -5.85   0.193   -5.56   -6.16
##  7 greece           4.88  0.255     5.38    4.48   -6.61   0.331   -6.15   -7.16
##  8 ireland          4.23  0.0437    4.33    4.16   -6.44   0.162   -6.19   -6.72
##  9 italy            3.73  0.220     4.05    3.38   -6.35   0.217   -6.08   -6.73
## 10 japan            4.70  0.684     6.00    3.95   -6.25   0.425   -5.71   -6.99
## 11 netherla         4.08  0.286     4.65    3.71   -5.92   0.193   -5.66   -6.22
## 12 norway           4.11  0.123     4.44    3.96   -5.75   0.201   -5.42   -6.09
## 13 spain            4.06  0.317     4.75    3.62   -5.63   0.278   -5.29   -6.17
## 14 sweden           4.01  0.0364    4.07    3.91   -7.82   0.126   -7.67   -8.07
## 15 switzerl         4.24  0.102     4.44    4.05   -5.93   0.124   -5.75   -6.16
## 16 turkey           5.77  0.329     6.16    5.14   -7.34   0.331   -6.89   -7.84
## 17 u.k.             3.98  0.0479    4.10    3.91   -6.02   0.107   -5.84   -6.19
## 18 u.s.a.           4.82  0.0219    4.86    4.79   -5.45   0.148   -5.22   -5.70
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, max_lrpmg <dbl>,
## #   min_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## #   max_lcarpcap <dbl>, min_lcarpcap <dbl>, and abbreviated variable names
## #   ¹​mean_lgaspcar, ²​sd_lgaspcar, ³​max_lgaspcar, ⁴​min_lgaspcar, ⁵​mean_lincomep,
## #   ⁶​sd_lincomep, ⁷​max_lincomep, ⁸​min_lincomep
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(dplyr::contains("car"), tibble::lst(mean, sd, max, min), .names = "{fn}_{col}"))
## # A tibble: 18 × 9
##    country  mean_lgasp…¹ sd_lg…² max_l…³ min_l…⁴ mean_…⁵ sd_lc…⁶ max_l…⁷ min_l…⁸
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 austria          4.06  0.0693    4.20    3.92   -8.85   0.473   -8.21   -9.77
##  2 belgium          3.92  0.103     4.16    3.82   -8.63   0.417   -8.10   -9.41
##  3 canada           4.86  0.0262    4.90    4.81   -8.08   0.195   -7.77   -8.38
##  4 denmark          4.19  0.158     4.50    4.00   -8.58   0.349   -8.20   -9.33
##  5 france           3.82  0.0499    3.91    3.75   -8.45   0.344   -8.01   -9.15
##  6 germany          3.89  0.0239    3.93    3.85   -8.51   0.406   -7.95   -9.34
##  7 greece           4.88  0.255     5.38    4.48  -10.8    0.839   -9.57  -12.2 
##  8 ireland          4.23  0.0437    4.33    4.16   -9.04   0.345   -8.55   -9.70
##  9 italy            3.73  0.220     4.05    3.38   -8.83   0.639   -8.11  -10.1 
## 10 japan            4.70  0.684     6.00    3.95   -9.95   1.20    -8.59  -12.2 
## 11 netherla         4.08  0.286     4.65    3.71   -8.82   0.617   -8.16  -10.0 
## 12 norway           4.11  0.123     4.44    3.96   -8.77   0.438   -8.17   -9.68
## 13 spain            4.06  0.317     4.75    3.62   -9.90   0.960   -8.63  -11.6 
## 14 sweden           4.01  0.0364    4.07    3.91   -8.25   0.242   -7.96   -8.74
## 15 switzerl         4.24  0.102     4.44    4.05   -8.54   0.378   -8.03   -9.26
## 16 turkey           5.77  0.329     6.16    5.14  -12.5    0.751  -11.2   -13.5 
## 17 u.k.             3.98  0.0479    4.10    3.91   -8.55   0.281   -8.26   -9.12
## 18 u.s.a.           4.82  0.0219    4.86    4.79   -7.78   0.162   -7.54   -8.02
## # … with abbreviated variable names ¹​mean_lgaspcar, ²​sd_lgaspcar,
## #   ³​max_lgaspcar, ⁴​min_lgaspcar, ⁵​mean_lcarpcap, ⁶​sd_lcarpcap, ⁷​max_lcarpcap,
## #   ⁸​min_lcarpcap
gasoline %>%
  group_by(country) %>%
  dplyr::summarise(across(is.numeric, tibble::lst(mean, sd, min, max), .names = "{fn}_{col}"))
## # A tibble: 18 × 17
##    country  mean_lgasp…¹ sd_lg…² min_l…³ max_l…⁴ mean_…⁵ sd_li…⁶ min_l…⁷ max_l…⁸
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 austria          4.06  0.0693    3.92    4.20   -6.12   0.235   -6.47   -5.76
##  2 belgium          3.92  0.103     3.82    4.16   -5.85   0.227   -6.22   -5.53
##  3 canada           4.86  0.0262    4.81    4.90   -5.58   0.193   -5.89   -5.31
##  4 denmark          4.19  0.158     4.00    4.50   -5.76   0.176   -6.06   -5.48
##  5 france           3.82  0.0499    3.75    3.91   -5.87   0.241   -6.26   -5.53
##  6 germany          3.89  0.0239    3.85    3.93   -5.85   0.193   -6.16   -5.56
##  7 greece           4.88  0.255     4.48    5.38   -6.61   0.331   -7.16   -6.15
##  8 ireland          4.23  0.0437    4.16    4.33   -6.44   0.162   -6.72   -6.19
##  9 italy            3.73  0.220     3.38    4.05   -6.35   0.217   -6.73   -6.08
## 10 japan            4.70  0.684     3.95    6.00   -6.25   0.425   -6.99   -5.71
## 11 netherla         4.08  0.286     3.71    4.65   -5.92   0.193   -6.22   -5.66
## 12 norway           4.11  0.123     3.96    4.44   -5.75   0.201   -6.09   -5.42
## 13 spain            4.06  0.317     3.62    4.75   -5.63   0.278   -6.17   -5.29
## 14 sweden           4.01  0.0364    3.91    4.07   -7.82   0.126   -8.07   -7.67
## 15 switzerl         4.24  0.102     4.05    4.44   -5.93   0.124   -6.16   -5.75
## 16 turkey           5.77  0.329     5.14    6.16   -7.34   0.331   -7.84   -6.89
## 17 u.k.             3.98  0.0479    3.91    4.10   -6.02   0.107   -6.19   -5.84
## 18 u.s.a.           4.82  0.0219    4.79    4.86   -5.45   0.148   -5.70   -5.22
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, min_lrpmg <dbl>,
## #   max_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## #   min_lcarpcap <dbl>, max_lcarpcap <dbl>, and abbreviated variable names
## #   ¹​mean_lgaspcar, ²​sd_lgaspcar, ³​min_lgaspcar, ⁴​max_lgaspcar, ⁵​mean_lincomep,
## #   ⁶​sd_lincomep, ⁷​min_lincomep, ⁸​max_lincomep
gasoline %>%
  select(-year) %>%
  group_by(country) %>%
  dplyr::summarise(across(everything(), tibble::lst(mean, sd, min, max), .names = "{fn}_{col}"))
## # A tibble: 18 × 17
##    country  mean_lgasp…¹ sd_lg…² min_l…³ max_l…⁴ mean_…⁵ sd_li…⁶ min_l…⁷ max_l…⁸
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 austria          4.06  0.0693    3.92    4.20   -6.12   0.235   -6.47   -5.76
##  2 belgium          3.92  0.103     3.82    4.16   -5.85   0.227   -6.22   -5.53
##  3 canada           4.86  0.0262    4.81    4.90   -5.58   0.193   -5.89   -5.31
##  4 denmark          4.19  0.158     4.00    4.50   -5.76   0.176   -6.06   -5.48
##  5 france           3.82  0.0499    3.75    3.91   -5.87   0.241   -6.26   -5.53
##  6 germany          3.89  0.0239    3.85    3.93   -5.85   0.193   -6.16   -5.56
##  7 greece           4.88  0.255     4.48    5.38   -6.61   0.331   -7.16   -6.15
##  8 ireland          4.23  0.0437    4.16    4.33   -6.44   0.162   -6.72   -6.19
##  9 italy            3.73  0.220     3.38    4.05   -6.35   0.217   -6.73   -6.08
## 10 japan            4.70  0.684     3.95    6.00   -6.25   0.425   -6.99   -5.71
## 11 netherla         4.08  0.286     3.71    4.65   -5.92   0.193   -6.22   -5.66
## 12 norway           4.11  0.123     3.96    4.44   -5.75   0.201   -6.09   -5.42
## 13 spain            4.06  0.317     3.62    4.75   -5.63   0.278   -6.17   -5.29
## 14 sweden           4.01  0.0364    3.91    4.07   -7.82   0.126   -8.07   -7.67
## 15 switzerl         4.24  0.102     4.05    4.44   -5.93   0.124   -6.16   -5.75
## 16 turkey           5.77  0.329     5.14    6.16   -7.34   0.331   -7.84   -6.89
## 17 u.k.             3.98  0.0479    3.91    4.10   -6.02   0.107   -6.19   -5.84
## 18 u.s.a.           4.82  0.0219    4.79    4.86   -5.45   0.148   -5.70   -5.22
## # … with 8 more variables: mean_lrpmg <dbl>, sd_lrpmg <dbl>, min_lrpmg <dbl>,
## #   max_lrpmg <dbl>, mean_lcarpcap <dbl>, sd_lcarpcap <dbl>,
## #   min_lcarpcap <dbl>, max_lcarpcap <dbl>, and abbreviated variable names
## #   ¹​mean_lgaspcar, ²​sd_lgaspcar, ³​min_lgaspcar, ⁴​max_lgaspcar, ⁵​mean_lincomep,
## #   ⁶​sd_lincomep, ⁷​min_lincomep, ⁸​max_lincomep
# creating bins for continuous variables
starwars %>%
  count(height_intervals = cut_width(height, 10))
## # A tibble: 18 × 2
##    height_intervals     n
##    <fct>            <int>
##  1 [65,75]              1
##  2 (75,85]              1
##  3 (85,95]              2
##  4 (95,105]             3
##  5 (105,115]            1
##  6 (115,125]            1
##  7 (135,145]            1
##  8 (145,155]            2
##  9 (155,165]            7
## 10 (165,175]           14
## 11 (175,185]           20
## 12 (185,195]           12
## 13 (195,205]            7
## 14 (205,215]            3
## 15 (215,225]            2
## 16 (225,235]            3
## 17 (255,265]            1
## 18 <NA>                 6
# calculating the sum of a variable based on groups
# using group_by() and summarise()
economics %>%
  mutate(
    year = format(date, "%Y")) %>%
  group_by(year) %>%
  dplyr::summarise(sum_unemploy = sum(unemploy, na.rm = TRUE))
## # A tibble: 49 × 2
##    year  sum_unemploy
##    <chr>        <dbl>
##  1 1967         18074
##  2 1968         33569
##  3 1969         33962
##  4 1970         49528
##  5 1971         60260
##  6 1972         58510
##  7 1973         52312
##  8 1974         62080
##  9 1975         95275
## 10 1976         88778
## # … with 39 more rows
# using count()
economics %>%
  count(year = format(date, "%Y"), wt = unemploy, name = "sum_unemploy")
## # A tibble: 49 × 2
##    year  sum_unemploy
##    <chr>        <dbl>
##  1 1967         18074
##  2 1968         33569
##  3 1969         33962
##  4 1970         49528
##  5 1971         60260
##  6 1972         58510
##  7 1973         52312
##  8 1974         62080
##  9 1975         95275
## 10 1976         88778
## # … with 39 more rows
# adding counts as a variable to your data frame with add_count()
mpg %>%
  add_count(manufacturer, name = "number_of_cars_by_manufacturer") %>%
  select(manufacturer, model, number_of_cars_by_manufacturer) %>%
  glimpse()
## Rows: 234
## Columns: 3
## $ manufacturer                   <chr> "audi", "audi", "audi", "audi", "audi",…
## $ model                          <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4…
## $ number_of_cars_by_manufacturer <int> 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,…
# adding a new variable to your data frame that contains the sum of a specific variable
mpg %>%
  group_by(model) %>%
  add_tally(wt = displ, name = "sum_display_per_model") %>%
  select(manufacturer, model, sum_display_per_model) %>%
  glimpse()
## Rows: 234
## Columns: 3
## Groups: model [38]
## $ manufacturer          <chr> "audi", "audi", "audi", "audi", "audi", "audi", …
## $ model                 <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 qu…
## $ sum_display_per_model <dbl> 16.3, 16.3, 16.3, 16.3, 16.3, 16.3, 16.3, 19.4, …

10.3 Packages

To count the number of rows within a group the janitor’s tabyl() function makes it really easy. While tabyl() counted rows by group, janitor’s adorn_percentages() function will calculate percentages in a data frame, allowing you to choose whether the denominator for dividing each item should be a sum by row or col. If you’d like the results to look like conventional percents – multiplied by a hundred and rounded with the percent sign included – add the adorn_pct_formatting() function. However, this will turn the percents into character strings because of the percent sign, which R doesn’t recognize as part of a number. So only use that formatting if you don’t need the data as numbers in your data frame. The syntax for calculating percents that are non-rounded fractions: adorn_percentages(mydf, denominator = "col") for calculating by column. adorn_percentages(mydf) defaults by row.

# using janitor::tabyl() function to count number of rows within a group
contributions <- map_df(list.files("input/mayor_finance_reports", full.names = TRUE), rio::import) %>%
  filter(City == "Framingham", !str_detect(tolower(Address), "box")) %>%
  distinct(Contributor, Address, .keep_all = TRUE) %>%
  tabyl(Recipient, sort = TRUE) %>%
  # mutate(percent = round(percent * 100, 1)) %>% 
  select(Candidate = Recipient, Pct_Local_Contributors = percent)
contributions
##                   Candidate Pct_Local_Contributors
##       Horrigan, Joshua Paul            0.035820896
##  Neves-Grigg, Sr., Benjaman            0.011940299
##                 Sen, Dhruba            0.008955224
##             Sousa, Priscila            0.029850746
##       Spicer, Dr. Yvonne M.            0.516417910
##          Stefanini, John A.            0.337313433
##             Tilden, Mark S.            0.059701493
# using adorn_percentages()
results <- readr::read_csv("input/election_framingham_mayor_2017_09.csv", col_names = TRUE) %>% 
  dplyr::select(Candidate, Totals)
results
## # A tibble: 9 × 2
##   Candidate                Totals
##   <chr>                     <dbl>
## 1 Blanks                       56
## 2 Joshua Paul Horrigan        545
## 3 John A. Stefanini          3184
## 4 Dhruba P. Sen               101
## 5 Mark S. Tilden              439
## 6 Yvonne M. Spicer           5967
## 7 Benjaman A. Neves-Grigg,    134
## 8 Priscila Sousa              538
## 9 Write-Ins                    42
results <- results %>%
  filter(!(Candidate %in% c("Blanks", "Write-Ins"))) %>%
  adorn_percentages(denominator = "col") %>% 
   rename(Pct_Vote = Totals)
results
##                 Candidate    Pct_Vote
##      Joshua Paul Horrigan 0.049963330
##         John A. Stefanini 0.291895856
##             Dhruba P. Sen 0.009259259
##            Mark S. Tilden 0.040245691
##          Yvonne M. Spicer 0.547029703
##  Benjaman A. Neves-Grigg, 0.012284562
##            Priscila Sousa 0.049321599

11 Web scraping

11.1 Packages

Web scraping process:

  1. Check to make sure the site hasn’t blocked automated retrieval.
  2. Get a list of all the links we want, with the help of a Chrome extension called SelectorGadget and the rvest R package.
  3. Download the files.

11.1.1 Step 1: Follow the rules with robotstxt

It’s a World Wide Web convention that if a site wants to restrict automated bots from “crawling” its pages – either all pages or just some – it posts those details in a robots.txt file in its root directory. So, for a site at www.thesiteurl.com, robots.txt can be found at http://thesiteurl.com/robots.txt (or, in many cases, http://www.thesiteurl.com/robots.txt or https://www.thesiteurl.com/robots.txt).

To be a responsible and considerate Internet citizen, you should make sure a site hasn’t refused bots and scripts from accessing its pages before starting to scrape. You can look at these files manually – for example, checking RStudio’s robots.txt file by going to https://wwwrstudio.com/robots.txt in a browser. But it’s more elegant – and automated – to use the robotstxt package to check for you. Plus, each time you run a scraper (if it’s one you want to use more than once), you’ll be sure the site’s robots.txt hasn’t changed to exclude you. An easy way to do this is using the package robotstxt and the paths_allowed() function.

11.1.3 Step 3: Download files

Finally, we’d like to download the files at each of those links. We can do that by applying base R’s download.file() function. However, download.file() requires both a url (which we have) and a file name (which we don’t have yet), using the syntax download.file(myurl, myfilename). purrr can help. It would be easiest, but pretty clunky, to use the URL as the file name. We’ll be happier later if we extract a file name from the URL and use that for the name. Fortunately, base R has a function to do just that: basename(). We want to apply the basename() function to all the URLs in my_urls and save the results into an R vector of character strings. purrr’s map_chr() will do just that.

Ideally, I’d like to use the vector of URLs and the vector of file names when downloading files, so each URL is downloaded to a file with the appropriate file name. In other words: download.file(my_urls[1], my_filenames[1]), download.file(my_urls[2], my_filenames[2]), and so on. Both walk() and the map() family have sister functions designed to do just that: apply a function to two data sets at a time, one by one. For walk, it’s walk2(). For map, it’s map2(), map2_df(),map2_chr(), and so on.

In this case, we want to download the files, but the download.file() function itself saves the file – there’s no additional value we want to store. So, a walk() option is the better choice. walk2() uses the syntax walk2(myfirstvector, mysecondvector, myfunction) to apply myfunction like myfunction(myfirstvector[1], mysecondvector[1]), myfunction(myfirstvector[2], mysecondvector[2]), myfunction(myfirstvector[3], mysecondvector[3]), and so on. This code applies download.file() to the URLs and file names one by one in tandem, with the URLs as first argument and file names as the second argument.

# 1. Checking
# checking robotstxt
paths_allowed("https://www.rstudio.com/resources/cheatsheets/")
## [1] TRUE
# 2. Getting a list of all links we want
# selecting CSS (using previosly SelectorGadget!)
my_css <- "p .btn-primary"
# reading HTML page
my_html <- rvest::read_html("https://www.rstudio.com/resources/cheatsheets/")
# extracting the portion of the we want using CSS selectors
my_nodes <- rvest::html_nodes(my_html, my_css)
# all the items saved
head(my_nodes)
## {xml_nodeset (6)}
## [1] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [2] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [3] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr ...
## [4] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data- ...
## [5] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/purrr ...
## [6] <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/strin ...
# looking the items in the list
my_nodes[[2]]
## {html_node}
## <a href="https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf" class="btn btn-primary">
# extracting the href attribute
my_urls <- rvest::html_nodes(my_html, my_css) %>%
rvest::html_attr('href')
head(my_urls)
## [1] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-visualization.pdf" 
## [2] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf"
## [3] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/tidyr.pdf"              
## [4] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-import.pdf"        
## [5] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/purrr.pdf"              
## [6] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/strings.pdf"
# looking at the first item on the list
my_urls[2]
## [1] "https://raw.githubusercontent.com/rstudio/cheatsheets/main/data-transformation.pdf"
# extracting the htext attribute
my_nodes_text <- rvest::html_nodes(my_html, my_css) %>%
  rvest::html_text()
# looking at the first text
my_nodes_text[2]
## [1] "Download"
# 3. Downloading files
# getting the file names
my_filenames <- map_chr(my_urls, basename)
# applying a function to two vectors at a time
# walk2(my_urls, my_filenames, download.file)

# for loop option
# for (i in seq_along(1:length(my_urls)) {
#   download.file(my_urls[i], my_filenames[i])
# }

# another example
url <- "https://en.wikipedia.org/wiki/List_of_Nobel_laureates"
wiki <- rvest::read_html(url)
tables <- rvest::html_nodes(wiki, "table")
tables
## {xml_nodeset (6)}
## [1] <table class="wikitable sortable"><tbody>\n<tr>\n<th>Year\n</th>\n<th wid ...
## [2] <table class="nowraplinks hlist mw-collapsible mw-collapsed navbox-inner" ...
## [3] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [4] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [5] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbod ...
## [6] <table class="nowraplinks mw-collapsible autocollapse navbox-inner" style ...
laureates <- rvest::html_table(tables[[1]], fill = TRUE)
head(laureates)
## # A tibble: 6 × 7
##   Year  Physics                            Chemi…¹ Physi…² Liter…³ Peace Econo…⁴
##   <chr> <chr>                              <chr>   <chr>   <chr>   <chr> <chr>  
## 1 1901  Wilhelm Röntgen                    Jacobu… Emil A… Sully … Henr… —      
## 2 1902  Hendrik Lorentz;Pieter Zeeman      Herman… Ronald… Theodo… Élie… —      
## 3 1903  Henri Becquerel;Pierre Curie;Mari… Svante… Niels … Bjørns… Rand… —      
## 4 1904  Lord Rayleigh                      Willia… Ivan P… Frédér… Inst… —      
## 5 1905  Philipp Lenard                     Adolf … Robert… Henryk… Bert… —      
## 6 1906  J. J. Thomson                      Henri … Camill… Giosuè… Theo… —      
## # … with abbreviated variable names ¹​Chemistry, ²​`Physiologyor Medicine`,
## #   ³​Literature, ⁴​`Economics(The Sveriges Riksbank Prize)[13][a]`

12 Linting

The code in this RMarkdown is linted with the lintr package, which is based on the tidyverse style guide.

# lintr::lint("main.Rmd", linters =
#               lintr::with_defaults(
#                 commented_code_linter = NULL,
#                 trailing_whitespace_linter = NULL
#                 )
#             )
# if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")